OLD | NEW |
(Empty) | |
| 1 // |
| 2 // Copyright 2006, 2007 Google Inc. All Rights Reserved. |
| 3 // Author: dsites@google.com (Dick Sites) |
| 4 // |
| 5 // Design document: eng/designdocs/i18n/compact_encoding_detector.pdf |
| 6 |
| 7 #include "encodings/compact_enc_det/compact_enc_det.h" |
| 8 |
| 9 #include <math.h> // for sqrt |
| 10 #include <stddef.h> // for size_t |
| 11 #include <stdio.h> // for printf, fprintf, NULL, etc |
| 12 #include <stdlib.h> // for qsort |
| 13 #include <string.h> // for memset, memcpy, memcmp, etc |
| 14 #include <memory> |
| 15 #include <string> // for string, operator==, etc |
| 16 |
| 17 //#include "base/basictypes.h" // for uint8, uint32, char32, etc |
| 18 //#include "base/commandlineflags.h" // for DEFINE_bool, <anonymous>, etc |
| 19 //#include "base/logging.h" // for COMPACT_GOOGLE_LOG_FATAL, etc |
| 20 //#include "base/macros.h" // for COMPILE_ASSERT, arraysize, etc |
| 21 #include "encodings/compact_enc_det/compact_enc_det_hint_code.h" |
| 22 #include "encodings/compact_lang_det/win/cld_basictypes.h" |
| 23 #include "encodings/compact_lang_det/win/cld_commandlineflags.h" |
| 24 #include "encodings/compact_lang_det/win/cld_logging.h" |
| 25 #include "encodings/compact_lang_det/win/cld_macros.h" |
| 26 |
| 27 using std::string; |
| 28 |
| 29 // TODO |
| 30 // dsites 2007.10.09 |
| 31 // |
| 32 // Consider font=TT-BHxxx as user-defined => binary |
| 33 // Demote GB18030 if no 8x3x pair |
| 34 // Map byte2 ascii punct to 0x60, digits to 0x7e, gets them into hires |
| 35 // Consider removing/ignoring bytes 01-1F to avoid crap pollution |
| 36 // Possibly boost declared encoding in robust scan |
| 37 // googlebot tiny files |
| 38 // look for ranges of encodings |
| 39 // consider tags just as > < within aligned block of 32 |
| 40 // flag too few characters in postproc (Latin 6 problem) |
| 41 // Remove slow scan beyond 16KB |
| 42 // Consider removing kMostLikelyEncoding or cut it in half |
| 43 |
| 44 |
| 45 // A note on mixed encodings |
| 46 // |
| 47 // The most common encoding error on the web is a page containing a mixture of |
| 48 // CP-1252 and UTF-8. A less common encoding error is a third-party feed that |
| 49 // has been converted from CP-1252 to UTF-8 and then those bytes converted a |
| 50 // second time to UTF-8. CED originally attempted to detect these error cases |
| 51 // by using two synthetic encodings, UTF8CP1252 and UTF8UTF8. The intended |
| 52 // implementation was to start these just below CP1252 and UTF8 respectively in |
| 53 // overall likelihood, and allow 1252 and UTF8 to fall behind if mixtures are |
| 54 // found. |
| 55 // |
| 56 // The UTF8UTF8 encoding is a possible outcome from CED, but unfortunately the |
| 57 // UTF8CP1252 internal encoding was added late and not put into encodings.proto, |
| 58 // so at the final step it is mapped to UTF8UTF8 also. This was a bad idea and |
| 59 // is removed in this November 2011 CL. |
| 60 // |
| 61 // Mixed encoding detection never worked out as well as envisioned, so the |
| 62 // ced_allow_utf8utf8 flag normally disables all this. |
| 63 // |
| 64 // The effect is that CP-1252 and UTF-8 mixtures will usually be detected as |
| 65 // UTF8, and the inputconverter code for UTF8 normally will convert bare |
| 66 // CP-1252 bytes to UTF-8, instead of the less-helpful FFFD substitution. UTF-8 |
| 67 // and double-UTF-8 mixtures will be detected as UTF-8, and the double |
| 68 // conversion will stand. |
| 69 // |
| 70 // However, it is occasionally useful to use CED to detect double-converted |
| 71 // UTF-8 coming from third-party data feeds, so they can be fixed at the source. |
| 72 // For this purpose, the UTF8UTF8 encoding remains available under the |
| 73 // ced_allow_utf8utf8 flag. |
| 74 // |
| 75 // When UTF8UTF8 is detected, the inputconverter code will undo the double |
| 76 // conversion, giving good text. |
| 77 |
| 78 // Norbert Runge has noted these words in CP1252 that are mistakenly identified |
| 79 // as UTF-8 because of the last pair of characters: |
| 80 // NESTLÉ® 0xC9 0xAE U+00C9 U+00AE C9AE = U+026E;SMALL LEZH |
| 81 // drauß\u2019 0xDF 0x92 U+00DF U+2019 DF92 = U+07D2;NKO LETTER N |
| 82 // Mutterschoß\u201c 0xDF 0x93 U+00DF U+201C DF93 = U+07D3;NKO LETTER BA |
| 83 // Schoß\u201c 0xDF 0x93 U+00DF U+201C |
| 84 // weiß\u201c 0xDF 0x93 U+00DF U+201C |
| 85 // Schnellfuß\u201c 0xDF 0x93 U+00DF U+201C |
| 86 // süß« 0xDF 0xAB U+00DF U+00AB DFAB = U+07EB;NKO HIGH TONE |
| 87 // These four byte combinations now explicitly boost Latin1/CP1252. |
| 88 |
| 89 // And for reference, here are a couple of Portuguese spellings |
| 90 // that may be mistaken as double-byte encodings. |
| 91 // informações 0xE7 0xF5 |
| 92 // traição 0xE7 0xE3 |
| 93 |
| 94 |
| 95 static const char* kVersion = "2.2"; |
| 96 |
| 97 DEFINE_bool(ced_allow_utf8utf8, false, "Allow the UTF8UTF8 encoding, " |
| 98 "to handle mixtures of CP1252 " |
| 99 "converted to UTF-8 zero, one, " |
| 100 "or two times"); |
| 101 DEFINE_int32(enc_detect_slow_max_kb, 16, |
| 102 "Maximum number of Kbytes to examine for " |
| 103 "7-bit-only (2022, Hz, UTF7) encoding detect. " |
| 104 "You are unlikely to want to change this."); |
| 105 DEFINE_int32(enc_detect_fast_max_kb, 256, |
| 106 "Maximum number of Kbytes to examine for encoding detect. " |
| 107 "You are unlikely to want to change this."); |
| 108 |
| 109 DEFINE_int32(ced_reliable_difference, 300, "30 * Bits of minimum probablility " |
| 110 "difference 1st - 2nd to be considered reliable \n" |
| 111 " 2 corresponds to min 4x difference\n" |
| 112 " 4 corresponds to min 16x difference\n" |
| 113 " 8 corresponds to min 256x difference\n" |
| 114 " 10 corresponds to min 1024x difference\n" |
| 115 " 20 corresponds to min 1Mx difference."); |
| 116 |
| 117 // Text debug output options |
| 118 DEFINE_bool(enc_detect_summary, false, |
| 119 "Print first 16 interesting pairs at exit."); |
| 120 DEFINE_bool(counts, false, "Count major-section usage"); |
| 121 |
| 122 // PostScript debug output options |
| 123 DEFINE_bool(enc_detect_detail, false, |
| 124 "Print PostScript of every update, to stderr."); |
| 125 DEFINE_bool(enc_detect_detail2, false, |
| 126 "More PostScript detail of every update, to stderr."); |
| 127 DEFINE_bool(enc_detect_source, false, "Include source text in detail"); |
| 128 // Encoding name must exactly match FIRST column of kI18NInfoByEncoding in |
| 129 // lang_enc.cc |
| 130 DEFINE_string(enc_detect_watch1, "", "Do detail2 about this encoding name."); |
| 131 DEFINE_string(enc_detect_watch2, "", "Do detail2 about this encoding name."); |
| 132 |
| 133 |
| 134 // Only for experiments. Delete soon. |
| 135 DEFINE_bool(force127, false, "Force Latin1, Latin2, Latin7 based on trigrams"); |
| 136 |
| 137 // Demo-mode/debugging experiment |
| 138 DEFINE_bool(demo_nodefault, false, |
| 139 "Default to all equal; no boost for declared encoding."); |
| 140 DEFINE_bool(dirtsimple, false, "Just scan and count for all encodings"); |
| 141 DEFINE_bool(ced_echo_input, false, "Echo ced input to stderr"); |
| 142 |
| 143 |
// Scoring and tuning constants.
// Scores are kept in scaled log2 units: XLOG2 (30) units per bit, matching the
// "30 * Bits" wording of the ced_reliable_difference flag, and XDECILOG2 (3)
// units per tenth of a bit.
| 144 static const int XDECILOG2 = 3; // Multiplier for log base 2 ** n/10 (score units per tenth of a bit) |
| 145 static const int XLOG2 = 30; // Multiplier for log base 2 ** n (score units per bit) |
| 146 |
| 147 static const int kFinalPruneDifference = 10 * XLOG2; |
| 148 // Final bits of minimum |
| 149 // probability difference 1st-nth |
| 150 // to be pruned |
| 151 |
| 152 static const int kInititalPruneDifference = kFinalPruneDifference * 4; |
| 153 // Initial bits of minimum |
| 154 // probability difference 1st-nth |
| 155 // to be pruned |
| 156 // (the "Initital" spelling is part of the identifier) |
| 157 static const int kPruneDiffDecrement = kFinalPruneDifference; |
| 158 // Decrements bits of minimum |
| 159 // probability difference 1st-nth |
| 160 // to be pruned |
| 161 |
| 162 static const int kSmallInitDiff = 2 * XLOG2; // bits of minimum |
| 163 // probability difference, base to |
| 164 // superset encodings |
| 165 |
| 166 static const int kBoostInitial = 20 * XLOG2; // bits of boost for |
| 167 // initial byte patterns (BOM, 00) |
| 168 |
| 169 static const int kBadPairWhack = 20 * XLOG2; // bits of whack for |
| 170 // one bad pair |
| 171 |
| 172 static const int kBoostOnePair = 20 * XLOG2; // bits of boost for |
| 173 // one good pair in Hz, etc. |
| 174 |
| 175 static const int kGentleOnePair = 4 * XLOG2; // bits of boost for |
| 176 // one good sequence |
| 177 // |
| 178 static const int kGentlePairWhack = 2 * XLOG2; // bits of whack |
| 179 // for ill-formed sequence |
| 180 |
| 181 static const int kGentlePairBoost = 2 * XLOG2; // bits of boost |
| 182 // for well-formed sequence |
| 183 |
| 184 static const int kBoostPerB64Byte = 2 * XLOG2; // bits of boost per byte. |
| 185 // NOTE(review): the earlier wording here duplicated kBoostOnePair ("one good pair in Hz, etc."); the name suggests a per-base64-byte boost -- confirm at use site |
| 186 |
| 187 static const int kDeclaredEncBoost = 5 * XDECILOG2; // bits/10 of boost for |
| 188 // best declared encoding per bigram |
| 189 |
| 190 static const int kBestEncBoost = 5 * XDECILOG2; // bits/10 of boost for |
| 191 // best encoding per bigram |
| 192 |
| 193 static const int kTrigramBoost = 2 * XLOG2; // bits of boost for Latin127 tri |
| 194 |
| 195 static const int kMaxPairs = 48; // Max interesting pairs to look at |
| 196 // If you change this, |
| 197 // adjust *PruneDiff* |
| 198 |
| 199 static const int kPruneMask = 0x07; // Prune every 8 interesting pairs |
| 200 |
| 201 |
| 202 static const int kBestPairsCount = 16; // For first N pairs, do extra boost |
| 203 // based on most likely encoding |
| 204 // of pair over entire web |
| 205 |
| 206 static const int kDerateHintsBelow = 12; // If we have fewer than N bigrams, |
| 207 // weaken the hints enough that |
| 208 // unhinted encodings have a hope of |
| 209 // rising to the top |
| 210 |
| 211 static const int kMinRescanLength = 800; // Don't bother rescanning for |
| 212 // unreliable encoding if fewer |
| 213 // than this many bytes unscanned. |
| 214 // We will rescan at most last half |
| 215 // of this. |
| 216 |
| 217 static const int kStrongBinary = 12; // Make F_BINARY the only encoding |
| 218 static const int kWeakerBinary = 4; // Make F_BINARY likely encoding |
| 219 |
| 220 // These are byte counts from front of file |
| 221 static const int kBinaryHardAsciiLimit = 6 * 1024; // Not binary if all ASCII |
| 222 static const int kBinarySoftAsciiLimit = 8 * 1024; // " if mostly ASCII |
| 223 |
| 224 // We try here to avoid having title text dominate the encoding detection, |
| 225 // for the not-infrequent error case of title in encoding1, body in encoding2: |
| 226 // we want to bias toward encoding2 winning. |
| 227 // |
| 228 // kMaxBigramsTagTitleText should be a multiple of 2, 3, and 4, so that we |
| 229 // rarely cut off mid-character in the original (not-yet-detected) encoding. |
| 230 // This matters most for UTF-8 two- and three-byte codes and for |
| 231 // Shift-JIS three-byte codes. |
| 232 static const int kMaxBigramsTagTitleText = 12; // Keep only some tag text |
| 233 static const int kWeightshiftForTagTitleText = 4; // Give text in tags, etc. |
| 234 // 1/16 normal weight (a shift of 4) |
| 235 |
| 236 static const int kStrongPairs = 6; // Let reliable enc with this many |
| 237 // pairs overcome missing hint |
| 238 |
// Internal option flags; InternalDetectEncoding takes one as its first
// argument (flags). The non-zero values are distinct powers of two --
// presumably so they can be OR-ed together; confirm at call sites.
| 239 enum CEDInternalFlags { |
| 240 kCEDNone = 0, // The empty flag |
| 241 kCEDRescanning = 1, // Do not further recurse |
| 242 kCEDSlowscore = 2, // Do extra scoring |
| 243 kCEDForceTags = 4, // Always examine text inside tags |
| 244 }; |
| 245 |
| 246 // Forward declaration |
| 247 Encoding InternalDetectEncoding( |
| 248 CEDInternalFlags flags, const char* text, int text_length, |
| 249 const char* url_hint, const char* http_charset_hint, |
| 250 const char* meta_charset_hint, const int encoding_hint, |
| 251 const Language language_hint, // User interface lang |
| 252 const CompactEncDet::TextCorpusType corpus_type, |
| 253 bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable, |
| 254 Encoding* second_best_enc); |
| 255 |
// One record of byte statistics for aligned bigrams: optional high-resolution
// delta tables, mean/standard deviation of each byte, a scaling offset, and
// three 256-entry probability tables.
// NOTE(review): presumably the element type of the generated unigram_table
// (one entry per ranked encoding) -- confirm against the generated tables.
| 256 typedef struct { |
| 257 const uint8* hires[4]; // Pointers to possible high-resolution bigram deltas |
| 258 uint8 x_bar; // Average byte2 value |
| 259 uint8 y_bar; // Average byte1 value |
| 260 uint8 x_stddev; // Standard deviation of byte2 value |
| 261 uint8 y_stddev; // Standard deviation of byte1 value |
| 262 int so; // Scaling offset -- add to probabilities below |
| 263 const uint8 b1[256]; // Unigram probability for first byte of aligned bigram |
| 264 const uint8 b2[256]; // Unigram probability for second byte of aligned bigram |
| 265 const uint8 b12[256]; // Unigram probability for cross bytes of aligned bigram |
| 266 } UnigramEntry; |
| 267 |
| 268 //typedef struct { |
| 269 // uint8 b12[256*256]; // Bigram probability for aligned bigram |
| 270 //} FullBigramEntry; |
| 271 |
| 272 |
| 273 // Include all the postproc-generated tables here: |
| 274 // RankedEncoding |
| 275 // kMapToEncoding |
| 276 // unigram_table |
| 277 // kMostLikelyEncoding |
| 278 // kTLDHintProbs |
| 279 // kCharsetHintProbs |
| 280 // HintEntry, kMaxTldKey kMaxTldVector, etc. |
| 281 // ============================================================================= |
| 282 |
| 283 #include "encodings/compact_enc_det/compact_enc_det_generated_tables.h" |
| 284 |
| 285 |
| 286 #define F_ASCII F_Latin1 // "ASCII" is a misnomer, so this code uses "Latin1" |
| 287 |
| 288 #define F_BINARY F_X_BINARYENC // We are mid-update for name change |
| 289 #define F_UTF8UTF8 F_X_UTF8UTF8 // We are mid-update for name change |
| 290 #define F_BIG5_CP950 F_BIG5 // We are mid-update for name change |
| 291 #define F_Unicode F_UTF_16LE // We are mid-update for name change |
| 292 // ============================================================================= |
| 293 |
// Bit flags describing the special handling an encoding needs. kSpecialMask
// holds a sum of these per Encoding value, and
// DetectEncodingState::active_special holds the bits for the special cases
// that are currently active.
| 294 // 7-bit encodings have at least one "interesting" byte value < 0x80 |
| 295 // (00 0E 1B + ~) |
| 296 // JIS 2022-cn 2022-kr hz utf7 |
| 297 // Unicode UTF-16 UTF-32 |
| 298 // 8-bit encodings have no interesting byte values < 0x80 |
| 299 static const uint32 kSevenBitActive = 0x00000001; // needs <80 to detect |
| 300 static const uint32 kUTF7Active = 0x00000002; // <80 and + |
| 301 static const uint32 kHzActive = 0x00000004; // <80 and ~ |
| 302 static const uint32 kIso2022Active = 0x00000008; // <80 and 1B 0E 0F |
| 303 static const uint32 kUTF8Active = 0x00000010; |
| 304 static const uint32 kUTF8UTF8Active = 0x00000020; |
| 305 static const uint32 kUTF1632Active = 0x00000040; // <80 and 00 |
| 306 static const uint32 kBinaryActive = 0x00000080; // <80 and 00 |
| 307 static const uint32 kTwobyteCode = 0x00000100; // Needs 8xxx |
| 308 static const uint32 kIsIndicCode = 0x00000200; // Indic encoding (set in kSpecialMask for TSCII, the Tamil encodings, JAGRAN, BHASKAR, HTCHANAKYA) |
| 309 static const uint32 kHighAlphaCode = 0x00000400; // full alphabet in 8x-Fx |
| 310 static const uint32 kHighAccentCode = 0x00000800; // accents in 8x-Fx |
| 311 static const uint32 kEUCJPActive = 0x00001000; // Have to mess with phase |
| 312 |
| 313 |
| 314 // Debug only. not thread safe |
| 315 static int encdet_used = 0; |
| 316 static int rescore_used = 0; |
| 317 static int rescan_used = 0; |
| 318 static int robust_used = 0; |
| 319 static int looking_used = 0; |
| 320 static int doing_used = 0; |
| 321 |
| 322 |
| 323 // For debugging only -- about 256B/entry times about 500 = 128KB |
| 324 // TODO: only allocate this if being used |
// One recorded debug event. Entries are written by the SetDetails* recorders
// into DetectEncodingState::debug_data.
| 325 typedef struct { |
// offset: value passed to SetDetailsEncProb, or copied from the previous entry
// by the other recorders.
| 326 int offset; |
| 327 int best_enc; // Best ranked encoding for this bigram, or |
| 328 // -1 for overhead entries |
// label: text describing the event, supplied by the caller.
| 329 string label; |
// detail_enc_prob: snapshot of DetectEncodingState::enc_prob taken when the
// entry is recorded (SetDetailsLabel copies the previous entry's snapshot
// instead).
| 330 int detail_enc_prob[NUM_RANKEDENCODING]; |
| 331 } DetailEntry; |
| 332 |
| 333 static int watch1_rankedenc = -1; // Debug. not threadsafe |
| 334 static int watch2_rankedenc = -1; // Debug. not threadsafe |
| 335 ////static int next_detail_entry = 0; // Debug. not threadsafe |
| 336 ////static DetailEntry details[kMaxPairs * 10]; // Allow 10 details per bigram |
| 337 // End For debugging only |
| 338 |
// Selects one of the two interesting-pair sets kept in DetectEncodingState;
// NUM_PAIR_SETS is the first dimension of those per-set arrays.
| 339 // Must match kTestPrintableAsciiTildePlus exit codes, minus one (that table yields 1 for printable ASCII, 2 otherwise) |
| 340 enum PairSet {AsciiPair = 0, OtherPair = 1, NUM_PAIR_SETS = 2}; |
| 341 |
| 342 // The reasons for pruning (enumerators take the default values 0, 1, 2) |
| 343 enum PruneReason {PRUNE_NORMAL, PRUNE_SLOWEND, PRUNE_FINAL}; |
| 344 |
// Printable names for the two pair sets, in the same order as the PairSet
// enumerators (AsciiPair, OtherPair) -- presumably indexed by PairSet; confirm
// at use sites.
| 345 static const char* kWhatSetName[] = {"Ascii", "Other"}; |
| 346 |
| 347 |
| 348 // State for encodings that do shift-out/shift-in between one- and two-byte |
| 349 // regions (ISO-2022-xx, HZ); stored in DetectEncodingState::next_2022_state and next_hz_state |
| 350 enum StateSoSi {SOSI_NONE, SOSI_ERROR, SOSI_ONEBYTE, SOSI_TWOBYTE}; |
| 351 |
// Working state for one encoding detection: source pointers, optional debug
// log, hints, mini state machines for the difficult encodings, the list of
// still-active ranked encodings with their cumulative probabilities, and the
// two buffers of "interesting" byte pairs (printable ASCII / other).
| 352 typedef struct { |
| 353 const uint8* initial_src; // For calculating byte offsets |
| 354 const uint8* limit_src; // Range of input source |
| 355 const uint8* prior_src; // Source consumed by prior call to BoostPrune |
| 356 const uint8* last_pair; // Last pair inserted into interesting_pairs |
| 357 |
| 358 DetailEntry* debug_data; // Normally NULL. Ptr to debug data for |
| 359 // FLAGS_enc_detect_detail PostScript data |
| 360 int next_detail_entry; // Debug: index of the next entry to write in debug_data |
| 361 |
| 362 bool done; |
| 363 bool reliable; |
| 364 bool hints_derated; |
| 365 int declared_enc_1; // From http/meta hint |
| 366 int declared_enc_2; // From http/meta hint |
| 367 int prune_count; // Number of times we have pruned |
| 368 |
| 369 int trigram_highwater_mark; // Byte offset of last trigram processing |
| 370 bool looking_for_latin_trigrams; // True if we should test for doing |
| 371 // Latin1/2/7 trigram processing |
| 372 bool do_latin_trigrams; // True if we actually are scoring trigrams |
| 373 |
| 374 // Miscellaneous state variables for difficult encodings |
| 375 int binary_quadrants_count; // Number of four bigram quadrants seen: |
| 376 // 0xxxxxxx0xxxxxxx 0xxxxxxx1xxxxxxx |
| 377 // 1xxxxxxx0xxxxxxx 1xxxxxxx1xxxxxxx |
| 378 int binary_8x4_count; // Number of 8x4 buckets seen: |
| 379 uint32 binary_quadrants_seen; // Bit[i] set if bigram i.......i....... seen |
| 380 uint32 binary_8x4_seen; // Bit[i] set if bigram iii.....ii...... seen |
| 381 int utf7_starts; // Count of possible UTF-7 beginnings seen |
| 382 int prior_utf7_offset; // Source consumed by prior UTF-7 string |
| 383 int next_utf8_ministate; // Mini state for UTF-8 sequences |
| 384 int utf8_minicount[6]; // Number of correct 2- 3- 4-byte seq, errors |
| 385 int next_utf8utf8_ministate; // Mini state for UTF8UTF8 sequences |
| 386 int utf8utf8_odd_byte; // UTF8UTF8 seq has odd number of bytes |
| 387 int utf8utf8_minicount[6]; // Number of correct 2- 3- 4-byte seq, errors |
| 388 StateSoSi next_2022_state; // Mini state for 2022 sequences |
| 389 StateSoSi next_hz_state; // Mini state for HZ sequences |
| 390 bool next_eucjp_oddphase; // Mini state for EUC-JP sequences |
| 391 int byte32_count[8]; // Count of top 3 bits of byte1 of bigram |
| 392 // 0x1x 2x3x 4x5x 6x7x 8x9x AxBx CxDx ExFx |
| 393 uint32 active_special; // Bits showing which special cases are active |
| 394 |
| 395 Encoding tld_hint; // Top TLD encoding or UNKNOWN |
| 396 Encoding http_hint; // http_hint and meta_hint: what the document says about itself, or UNKNOWN_ENCODING |
| 397 Encoding meta_hint; // (see http_hint) |
| 398 Encoding bom_hint; // From the BOM, the initial byte order mark for UTF-xx |
| 399 |
| 400 // small cache of previous interesting bigrams |
| 401 int next_prior_bigram; |
| 402 int prior_bigram[4]; |
| 403 int prior_binary[1]; |
| 404 |
| 405 int top_rankedencoding; // Top two probabilities and families |
| 406 int second_top_rankedencoding; |
| 407 int top_prob; |
| 408 int second_top_prob; |
| 409 int prune_difference; // Prune things this much below the top prob |
| 410 int rankedencoding_list_len; // Number of active encodings |
| 411 int rankedencoding_list[NUM_RANKEDENCODING]; // List of active encodings |
| 412 // |
| 413 int enc_prob[NUM_RANKEDENCODING]; // Cumulative probability per enc |
| 414 // This is where all the action is |
| 415 int hint_prob[NUM_RANKEDENCODING]; // Initial hint probabilities |
| 416 int hint_weight[NUM_RANKEDENCODING]; // Number of hints for this enc |
| 417 |
| 418 // Two sets -- one for printable ASCII, one for the rest |
| 419 int prior_interesting_pair[NUM_PAIR_SETS]; // Pairs consumed by prior call |
| 420 int next_interesting_pair[NUM_PAIR_SETS]; // Next pair to write |
| 421 char interesting_pairs[NUM_PAIR_SETS][kMaxPairs * 2]; // Two bytes per pair |
| 422 int interesting_offsets[NUM_PAIR_SETS][kMaxPairs]; // Src offset of pair |
| 423 int interesting_weightshift[NUM_PAIR_SETS][kMaxPairs]; // weightshift of pair |
| 424 } DetectEncodingState; |
| 425 |
| 426 |
| 427 // Record a debug event that changes probabilities |
| 428 void SetDetailsEncProb(DetectEncodingState* destatep, |
| 429 int offset, int best_enc, const char* label) { |
| 430 int next = destatep->next_detail_entry; |
| 431 destatep->debug_data[next].offset = offset; |
| 432 destatep->debug_data[next].best_enc = best_enc; |
| 433 destatep->debug_data[next].label = label; |
| 434 memcpy(&destatep->debug_data[next].detail_enc_prob, |
| 435 &destatep->enc_prob, |
| 436 sizeof(destatep->enc_prob)); |
| 437 ++destatep->next_detail_entry; |
| 438 } |
| 439 |
| 440 // Record a debug event that changes probabilities, copy offset |
| 441 void SetDetailsEncProbCopyOffset(DetectEncodingState* destatep, |
| 442 int best_enc, const char* label) { |
| 443 int next = destatep->next_detail_entry; |
| 444 destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset; |
| 445 destatep->debug_data[next].best_enc = best_enc; |
| 446 destatep->debug_data[next].label = label; |
| 447 memcpy(&destatep->debug_data[next].detail_enc_prob, |
| 448 &destatep->enc_prob, |
| 449 sizeof(destatep->enc_prob)); |
| 450 ++destatep->next_detail_entry; |
| 451 } |
| 452 |
| 453 // Record a debug event that changes probs and has simple text label |
| 454 void SetDetailsEncLabel(DetectEncodingState* destatep, const char* label) { |
| 455 int next = destatep->next_detail_entry; |
| 456 destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset; |
| 457 destatep->debug_data[next].best_enc = -1; |
| 458 destatep->debug_data[next].label = label; |
| 459 memcpy(&destatep->debug_data[next].detail_enc_prob, |
| 460 &destatep->enc_prob, |
| 461 sizeof(destatep->enc_prob)); |
| 462 ++destatep->next_detail_entry; |
| 463 } |
| 464 |
| 465 // Record a debug event that is just a text label, no change in probs |
| 466 void SetDetailsLabel(DetectEncodingState* destatep, const char* label) { |
| 467 int next = destatep->next_detail_entry; |
| 468 destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset; |
| 469 destatep->debug_data[next].best_enc = -1; |
| 470 destatep->debug_data[next].label = label; |
| 471 memcpy(&destatep->debug_data[next].detail_enc_prob, |
| 472 &destatep->debug_data[next - 1].detail_enc_prob, |
| 473 sizeof(destatep->enc_prob)); |
| 474 ++destatep->next_detail_entry; |
| 475 } |
| 476 |
| 477 |
// Subscripted by Encoding enum value (the leading number in each row comment);
// the COMPILE_ASSERT below checks that there is one row per encoding.
| 478 // Maps superset encodings to base, to see if 2 encodings are compatible |
| 479 // (Non-identity mappings are marked "-->" below.) |
| 480 static const Encoding kMapEncToBaseEncoding[] = { |
| 481 ISO_8859_1, // 0: Teragram ASCII |
| 482 ISO_8859_2, // 1: Teragram Latin2 |
| 483 ISO_8859_3, // 2: in BasisTech but not in Teragram |
| 484 ISO_8859_4, // 3: Teragram Latin4 |
| 485 ISO_8859_5, // 4: Teragram ISO-8859-5 |
| 486 ISO_8859_6, // 5: Teragram Arabic |
| 487 ISO_8859_7, // 6: Teragram Greek |
| 488 MSFT_CP1255, // 7: Teragram Hebrew --> 36 |
| 489 ISO_8859_9, // 8: in BasisTech but not in Teragram |
| 490 ISO_8859_10, // 9: in BasisTech but not in Teragram |
| 491 JAPANESE_EUC_JP, // 10: Teragram EUC_JP |
| 492 JAPANESE_SHIFT_JIS, // 11: Teragram SJS |
| 493 JAPANESE_JIS, // 12: Teragram JIS |
| 494 CHINESE_BIG5, // 13: Teragram BIG5 |
| 495 CHINESE_GB, // 14: Teragram GB |
| 496 CHINESE_EUC_CN, // 15: Teragram EUC-CN |
| 497 KOREAN_EUC_KR, // 16: Teragram KSC |
| 498 UNICODE, // 17: Teragram Unicode |
| 499 CHINESE_EUC_CN, // 18: Teragram EUC --> 15 |
| 500 CHINESE_EUC_CN, // 19: Teragram CNS --> 15 |
| 501 CHINESE_BIG5, // 20: Teragram BIG5_CP950 --> 13 |
| 502 JAPANESE_SHIFT_JIS, // 21: Teragram CP932 --> 11 |
| 503 UTF8, // 22 |
| 504 UNKNOWN_ENCODING, // 23 |
| 505 ISO_8859_1, // 24: ISO_8859_1 with all characters <= 127 --> 0 |
| 506 RUSSIAN_KOI8_R, // 25: Teragram KOI8R |
| 507 RUSSIAN_CP1251, // 26: Teragram CP1251 |
| 508 ISO_8859_1, // 27: CP1252 aka MSFT euro ascii --> 0 |
| 509 RUSSIAN_KOI8_RU, // 28: CP21866 aka KOI8_RU, used for Ukrainian |
| 510 MSFT_CP1250, // 29: CP1250 aka MSFT eastern european |
| 511 ISO_8859_1, // 30: aka ISO_8859_0 aka ISO_8859_1 euroized --> 0 |
| 512 ISO_8859_9, // 31: used for Turkish |
| 513 ISO_8859_13, // 32: used in Baltic countries --> 43 |
| 514 ISO_8859_11, // 33: aka TIS-620, used for Thai |
| 515 ISO_8859_11, // 34: used for Thai --> 33 |
| 516 MSFT_CP1256, // 35: used for Arabic |
| 517 MSFT_CP1255, // 36: Logical Hebrew Microsoft |
| 518 MSFT_CP1255, // 37: Iso Hebrew Logical --> 36 |
| 519 MSFT_CP1255, // 38: Iso Hebrew Visual --> 36 |
| 520 CZECH_CP852, // 39 |
| 521 ISO_8859_2, // 40: aka ISO_IR_139 aka KOI8_CS --> 1 |
| 522 MSFT_CP1253, // 41: used for Greek, but NOT a superset of 8859-7 |
| 523 RUSSIAN_CP866, // 42 |
| 524 ISO_8859_13, // 43 |
| 525 ISO_2022_KR, // 44 |
| 526 CHINESE_GB, // 45 GBK --> 14 |
| 527 CHINESE_GB, // 46 GB18030 --> 14 |
| 528 CHINESE_BIG5, // 47 BIG5_HKSCS --> 13 |
| 529 ISO_2022_KR, // 48 ISO_2022_CN --> 44 |
| 530 TSCII, // 49 Indic encoding |
| 531 TAMIL_MONO, // 50 Indic encoding - Tamil |
| 532 TAMIL_BI, // 51 Indic encoding - Tamil |
| 533 JAGRAN, // 52 Indic encoding - Devanagari |
| 534 MACINTOSH_ROMAN, // 53 |
| 535 UTF7, // 54 |
| 536 BHASKAR, // 55 Indic encoding - Devanagari |
| 537 HTCHANAKYA, // 56 Indic encoding - Devanagari |
| 538 UTF16BE, // 57 |
| 539 UTF16LE, // 58 |
| 540 UTF32BE, // 59 |
| 541 UTF32LE, // 60 |
| 542 BINARYENC, // 61 |
| 543 HZ_GB_2312, // 62 |
| 544 UTF8UTF8, // 63 |
| 545 TAM_ELANGO, // 64 Elango - Tamil |
| 546 TAM_LTTMBARANI, // 65 Barani - Tamil |
| 547 TAM_SHREE, // 66 Shree - Tamil |
| 548 TAM_TBOOMIS, // 67 TBoomis - Tamil |
| 549 TAM_TMNEWS, // 68 TMNews - Tamil |
| 550 TAM_WEBTAMIL, // 69 Webtamil - Tamil |
| 551 KDDI_SHIFT_JIS, // 70 KDDI Shift_JIS |
| 552 DOCOMO_SHIFT_JIS, // 71 DoCoMo Shift_JIS |
| 553 SOFTBANK_SHIFT_JIS, // 72 SoftBank Shift_JIS |
| 554 KDDI_ISO_2022_JP, // 73 KDDI ISO-2022-JP |
| 555 SOFTBANK_ISO_2022_JP, // 74 SOFTBANK ISO-2022-JP |
| 556 }; |
| 557 |
| 558 COMPILE_ASSERT(arraysize(kMapEncToBaseEncoding) == NUM_ENCODINGS, |
| 559 kMapEncToBaseEncoding_has_incorrect_size); |
| 560 |
// Subscripted by Encoding enum value (the leading number in each row comment);
// the COMPILE_ASSERT below checks that there is one row per encoding. The
// "-->" notes repeat the base-encoding targets from kMapEncToBaseEncoding.
| 561 // Maps base encodings to 0, supersets to 1+, undesired to -1 |
| 562 // (Non-identity mappings are marked "-->" below.) |
| 563 static const int kMapEncToSuperLevel[] = { |
| 564 0, // 0: Teragram ASCII |
| 565 0, // 1: Teragram Latin2 |
| 566 0, // 2: in BasisTech but not in Teragram |
| 567 0, // 3: Teragram Latin4 |
| 568 0, // 4: Teragram ISO-8859-5 |
| 569 0, // 5: Teragram Arabic |
| 570 0, // 6: Teragram Greek |
| 571 0, // 7: Teragram Hebrew |
| 572 0, // 8: in BasisTech but not in Teragram |
| 573 0, // 9: in BasisTech but not in Teragram |
| 574 0, // 10: Teragram EUC_JP |
| 575 0, // 11: Teragram SJS |
| 576 0, // 12: Teragram JIS |
| 577 0, // 13: Teragram BIG5 |
| 578 0, // 14: Teragram GB |
| 579 0, // 15: Teragram EUC-CN |
| 580 0, // 16: Teragram KSC |
| 581 0, // 17: Teragram Unicode |
| 582 -1, // 18: Teragram EUC --> 15 |
| 583 -1, // 19: Teragram CNS --> 15 |
| 584 1, // 20: Teragram BIG5_CP950 --> 13 |
| 585 1, // 21: Teragram CP932 --> 11 |
| 586 0, // 22 |
| 587 -1, // 23 |
| 588 -1, // 24: ISO_8859_1 with all characters <= 127 --> 0 |
| 589 0, // 25: Teragram KOI8R |
| 590 0, // 26: Teragram CP1251 |
| 591 1, // 27: CP1252 aka MSFT euro ascii --> 0 |
| 592 0, // 28: CP21866 aka KOI8_RU, used for Ukrainian |
| 593 0, // 29: CP1250 aka MSFT eastern european |
| 594 1, // 30: aka ISO_8859_0 aka ISO_8859_1 euroized --> 0 |
| 595 0, // 31: used for Turkish |
| 596 1, // 32: used in Baltic countries --> 43 |
| 597 0, // 33: aka TIS-620, used for Thai |
| 598 1, // 34: used for Thai --> 33 |
| 599 0, // 35: used for Arabic |
| 600 0, // 36: Logical Hebrew Microsoft |
| 601 -1, // 37: Iso Hebrew Logical --> 36 |
| 602 -1, // 38: Iso Hebrew Visual --> 36 |
| 603 0, // 39 |
| 604 1, // 40: aka ISO_IR_139 aka KOI8_CS --> 1 |
| 605 0, // 41: used for Greek, NOT superset of 8859-7 |
| 606 0, // 42 |
| 607 0, // 43 |
| 608 0, // 44 |
| 609 1, // 45 GBK --> 14 |
| 610 1, // 46 GB18030 --> 14 |
| 611 1, // 47 BIG5_HKSCS --> 13 |
| 612 1, // 48 ISO_2022_CN --> 44 |
| 613 0, // 49 Indic encoding |
| 614 0, // 50 Indic encoding - Tamil |
| 615 0, // 51 Indic encoding - Tamil |
| 616 0, // 52 Indic encoding - Devanagari |
| 617 0, // 53 |
| 618 0, // 54 |
| 619 0, // 55 Indic encoding - Devanagari |
| 620 0, // 56 Indic encoding - Devanagari |
| 621 0, // 57 |
| 622 0, // 58 |
| 623 0, // 59 |
| 624 0, // 60 |
| 625 0, // 61 |
| 626 0, // 62 |
| 627 2, // 63 UTF8UTF8 |
| 628 0, 0, 0, 0, 0, 0, // add six more Tamil |
| 629 0, 0, 0, 0, 0, // add five encodings with emoji |
| 630 }; |
| 631 |
| 632 COMPILE_ASSERT(arraysize(kMapEncToSuperLevel) == NUM_ENCODINGS, |
| 633 kMapEncToSuperLevel_has_incorrect_size); |
| 634 |
| 635 |
| 636 |
| 637 // Subscripted by Encoding enum value |
| 638 static const uint32 kSpecialMask[] = { |
| 639 kHighAccentCode, // 0 |
| 640 kHighAccentCode, |
| 641 kHighAccentCode, |
| 642 kHighAccentCode, |
| 643 kHighAlphaCode, // 4 |
| 644 kHighAlphaCode, |
| 645 kHighAlphaCode, |
| 646 kHighAlphaCode, |
| 647 kHighAccentCode, |
| 648 kHighAccentCode, |
| 649 |
| 650 kTwobyteCode + kEUCJPActive, // 10 euc-jp |
| 651 kTwobyteCode, |
| 652 kSevenBitActive + kIso2022Active, // jis |
| 653 kTwobyteCode, |
| 654 kTwobyteCode, |
| 655 kTwobyteCode, |
| 656 kTwobyteCode, |
| 657 kSevenBitActive + kUTF1632Active, // Unicode |
| 658 kTwobyteCode, |
| 659 kTwobyteCode, |
| 660 |
| 661 kTwobyteCode, // 20 |
| 662 kTwobyteCode, |
| 663 kUTF8Active, // UTF-8 |
| 664 0, |
| 665 0, |
| 666 kHighAlphaCode, // 25 |
| 667 kHighAlphaCode, |
| 668 kHighAccentCode, |
| 669 kHighAlphaCode, |
| 670 kHighAccentCode, |
| 671 |
| 672 kHighAccentCode, // 30 |
| 673 kHighAccentCode, |
| 674 kHighAccentCode, |
| 675 kHighAlphaCode, |
| 676 kHighAlphaCode, |
| 677 kHighAlphaCode, // 35 |
| 678 kHighAlphaCode, |
| 679 kHighAlphaCode, |
| 680 kHighAlphaCode, |
| 681 0, |
| 682 |
| 683 0, // 40 |
| 684 kHighAlphaCode, |
| 685 kHighAlphaCode, |
| 686 kHighAccentCode, |
| 687 kSevenBitActive + kIso2022Active, // 2022-kr |
| 688 kTwobyteCode, |
| 689 kTwobyteCode, |
| 690 kTwobyteCode, |
| 691 kSevenBitActive + kIso2022Active, // 2022-cn |
| 692 kHighAlphaCode + kIsIndicCode, // 49 TSCII |
| 693 |
| 694 kHighAlphaCode + kIsIndicCode, // 50 TAMIL_MONO |
| 695 kHighAlphaCode + kIsIndicCode, // 51 TAMIL_BI |
| 696 kHighAlphaCode + kIsIndicCode, // 52 JAGRAN |
| 697 kHighAccentCode, // 53 MACINTOSH_ROMAN |
| 698 kSevenBitActive + kUTF7Active, // 54 UTF-7 |
| 699 kHighAlphaCode + kIsIndicCode, // 55 BHASKAR Indic encoding - Devanagari |
| 700 kHighAlphaCode + kIsIndicCode, // 56 HTCHANAKYA Indic encoding - Devanagari |
| 701 kSevenBitActive + kUTF1632Active, // 57 UTF16BE |
| 702 kSevenBitActive + kUTF1632Active, // 58 UTF16LE |
| 703 kSevenBitActive + kUTF1632Active, // 59 UTF32BE |
| 704 kSevenBitActive + kUTF1632Active, // 60 UTF32LE |
| 705 |
| 706 kSevenBitActive + kBinaryActive, // 61 BINARYENC |
| 707 kSevenBitActive + kHzActive, // 62 HZ_GB_2312 |
| 708 kHighAccentCode + kUTF8Active + kUTF8UTF8Active, // 63 UTF8UTF8 |
| 709 kHighAlphaCode + kIsIndicCode, // 64 Elango - Tamil |
| 710 kHighAlphaCode + kIsIndicCode, // 65 Barani - Tamil |
| 711 kHighAlphaCode + kIsIndicCode, // 66 Shree - Tamil |
| 712 kHighAlphaCode + kIsIndicCode, // 67 TBoomis - Tamil |
| 713 kHighAlphaCode + kIsIndicCode, // 68 TMNews - Tamil |
| 714 kHighAlphaCode + kIsIndicCode, // 69 Webtamil - Tamil |
| 715 kTwobyteCode, // 70 KDDI Shift_JIS |
| 716 kTwobyteCode, // 71 DoCoMo Shift_JIS |
| 717 kTwobyteCode, // 72 SoftBank Shift_JIS |
| 718 kSevenBitActive + kIso2022Active, // 73 KDDI-ISO-2022-JP |
| 719 kSevenBitActive + kIso2022Active, // 74 SOFTBANK-ISO-2022-JP |
| 720 }; |
| 721 |
// Compile-time check: kSpecialMask must have exactly NUM_ENCODINGS entries.
// The build fails with kSpecialMask_has_incorrect_size if the table and
// NUM_ENCODINGS get out of step.
COMPILE_ASSERT(arraysize(kSpecialMask) == NUM_ENCODINGS,
               kSpecialMask_has_incorrect_size);
| 724 |
| 725 |
| 726 /*** |
| 727 kHighAlphaCode -- full alphabet in 8x-Fx range, not just accents |
| 728 |
| 729 ISO_8859_5, // 4: Teragram ISO-8859-5 Cyrl UL bd |
| 730 RUSSIAN_CP1251, // 26: Teragram CP1251 UL cdef |
| 731 RUSSIAN_KOI8_R, // 25: Teragram KOI8R LU cdef |
| 732 RUSSIAN_KOI8_RU, // 28: CP21866 aka KOI8_RU, LU cdef |
| 733 RUSSIAN_CP866, // 42 89ae |
| 734 |
| 735 ISO_8859_6, // 5: Teragram Arabic nocase cde |
| 736 MSFT_CP1256, // 35: used for Arabic nocase cde |
| 737 |
| 738 ISO_8859_7, // 6: Teragram Greek UL cdef |
| 739 MSFT_CP1253, // 41: used for Greek UL cdef |
| 740 |
| 741 ISO_8859_8, // 7: Teragram Hebrew nocase ef |
| 742 MSFT_CP1255, // 36: Logical Hebrew Microsoft nocase ef |
| 743 ISO_8859_8_I, // 37: Iso Hebrew Logical nocase ef |
| 744 HEBREW_VISUAL, // 38: Iso Hebrew Visual nocase ef |
| 745 |
| 746 ISO_8859_11, // 33: aka TIS-620, used for Thai nocase abcde |
| 747 MSFT_CP874, // 34: used for Thai nocase abcde |
| 748 |
| 749 TSCII, // 49 8-f |
| 750 TAMIL_MONO, // 50 |
| 751 TAMIL_BI, // 51 |
| 752 JAGRAN, // 52 |
| 753 BHASKAR, // 55 Indic encoding - Devanagari |
| 754 HTCHANAKYA, // 56 Indic encoding - Devanagari |
| 755 ***/ |
| 756 |
// We can scan bytes using this at about 500 MB/sec 2.8GHz P4
// Slow scan uses this, stopping on NUL ESC SO SI bad C0 and + ~
// We allow FF, 0x0C, here because it gives a better result for old
// Ascii text formatted for a TTY
// non-zero exits scan loop -- 1 for printable ASCII, 2 otherwise
//
// Subscripted by byte value, 32 entries per row:
//   0  HT, LF, FF, CR, and 0x20..0x7D except '+'
//   1  '+' (0x2B) and '~' (0x7E)
//   2  every other C0 control, DEL (0x7F), and all of 0x80..0xFF
static const char kTestPrintableAsciiTildePlus[256] = {
  2,2,2,2,2,2,2,2, 2,0,0,2,0,0,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  0,0,0,0,0,0,0,0, 0,0,0,1,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,1,2,

  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
};
| 773 |
// We can scan bytes using this at about 550 MB/sec 2.8GHz P4
// Slow scan uses this, stopping on NUL ESC SO SI and bad C0
// after Hz and UTF7 are pruned away
// We allow Form Feed, 0x0C, here
//
// Subscripted by byte value, 32 entries per row:
//   0  HT, LF, FF, CR, and all of 0x20..0x7E ('+' and '~' included)
//   2  every other C0 control, DEL (0x7F), and all of 0x80..0xFF
// No entry is 1 in this table.
static const char kTestPrintableAscii[256] = {
  2,2,2,2,2,2,2,2, 2,0,0,2,0,0,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,2,

  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
};
| 789 |
// Used in first-four-byte testing
// Subscripted by byte value, 32 entries per row:
//   1  HT (0x09), LF (0x0A), CR (0x0D), and all of 0x20..0x7E
//   0  everything else -- note that FF (0x0C) is 0 here, unlike the
//      scan tables above
static const char kIsPrintableAscii[256] = {
  0,0,0,0,0,0,0,0, 0,1,1,0,0,1,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
| 802 |
| 803 |
// Subscripted by byte value, 16 entries per row.
// Gives the 6-bit value of a base64 digit:
//   'A'..'Z' -> 0..25, 'a'..'z' -> 26..51, '0'..'9' -> 52..61,
//   '+' -> 62, '/' -> 63
// Every other byte, including '=' and all of 0x80..0xFF, is -1.
static const signed char kBase64Value[256] = {
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,62,-1,-1,-1,63,
  52,53,54,55,56,57,58,59, 60,61,-1,-1,-1,-1,-1,-1,

  -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,
  15,16,17,18,19,20,21,22, 23,24,25,-1,-1,-1,-1,-1,
  -1,26,27,28,29,30,31,32, 33,34,35,36,37,38,39,40,
  41,42,43,44,45,46,47,48, 49,50,51,-1,-1,-1,-1,-1,

  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,

  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
};
| 825 |
| 826 |
// Subscripted by <state, byte/16>
// Accepts Cx->8x Dx->8x Ex->8x->8x Fx->8x->8x->8x
//
// Fixed Problem: GB has sequences like B2DB B8D6 BDE1 B9B9
// which we can mis-parse as an error byte followed by good UTF-8:
// B2 DBB8 D6BD E1B9B9
// To counteract this, we now require an ASCII7 byte to resync out
// of the error state
// Next problem: good UTF-8 with bad byte
// efbc a012 eea4 bee7 b280 c2b7
// efbca0 12 eea4be e7b280 c2b7
//        ^^ bad byte
// fix: change state0 byte 1x to be don't-care
//
// Short UTF-8 ending in ASCII7 byte should resync immediately:
// E0 20 E0 A6 AA should give one error and resync at 2nd E0
//
// Each entry is the next state. Columns are the high nibble of the byte:
// 0x..7x in the first eight columns, then 8x 9x Ax Bx (continuation
// bytes), then Cx Dx (2-byte lead), Ex (3-byte lead), Fx (4-byte lead).
static const char kMiniUTF8State[8][16] = {
  {0,0,0,0,0,0,0,0, 7,7,7,7,1,1,2,4,}, // [0] start char (allow cr/lf/ht)
  {0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,}, // [1] continue 1 of 2
  {0,7,0,0,0,0,0,0, 3,3,3,3,7,7,7,7,}, // [2] continue 1 of 3
  {0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,}, // [3] continue 2 of 3
  {0,7,0,0,0,0,0,0, 5,5,5,5,7,7,7,7,}, // [4] continue 1 of 4
  {0,7,0,0,0,0,0,0, 6,6,6,6,7,7,7,7,}, // [5] continue 2 of 4
  {0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,}, // [6] continue 3 of 4
  {0,7,0,0,0,0,0,0, 7,7,7,7,7,7,7,7,}, // [7] error, soak up continues,
                                       // ONLY resync after Ascii char
                                       // then restart
};
// Counter to increment: 0-don'tcare 1-error 2-good_2B 3-good_3B 4-good_4B
// Same <state, byte/16> subscripts as kMiniUTF8State. A good_NB value
// appears only in the column group for the final continuation byte
// (8x..Bx) of an N-byte sequence: rows [1], [3] and [6].
static const char kMiniUTF8Count[8][16] = {
  {0,0,0,0,0,0,0,0, 1,1,1,1,0,0,0,0,}, // [0] start char (allow cr/lf/ht)
  {1,1,1,1,1,1,1,1, 2,2,2,2,1,1,1,1,}, // [1] continue 1 of 2
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [2] continue 1 of 3
  {1,1,1,1,1,1,1,1, 3,3,3,3,1,1,1,1,}, // [3] continue 2 of 3
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [4] continue 1 of 4
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [5] continue 2 of 4
  {1,1,1,1,1,1,1,1, 4,4,4,4,1,1,1,1,}, // [6] continue 3 of 4
  {0,1,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,}, // [7] error, soak up continues,
                                       // then restart
};
| 868 |
| 869 // Subscripted by <state, f(byte1) + g(byte2)> |
| 870 // where f(x)= E2->4, C2/5/6/B->8 and C3->12 and 0 otherwise |
| 871 // and g(x) = (x >> 4) & 3 8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc. |
| 872 // (no checking for illegal bytes) |
| 873 // Here are example patterns of CP1252 converted to UTF-8 0/1/2 times. We want |
| 874 // to detect two, so we can back-convert to one. |
| 875 // zero one two pattern |
| 876 // ---- ------ ---------------- ----------------- |
| 877 // 81 C281 C382C281 C3->8x->C2->xx |
| 878 // 98 CB9C C38BC593 C3->8x->C5->xx |
| 879 // C3 C383 C383C692 C3->8x->C6->xx |
| 880 // C8 C388 C383CB86 C3->8x->CB->xx |
| 881 // 83 C692 C386E28099 C3->8x->E2->xx->8x |
| 882 // 80 E282AC C3A2E2809AC2AC C3->A2->E2->xx->xx->Cx->xx |
| 883 // 92 E28099 C3A2E282ACE284A2 C3->A2->E2->xx->xx->E2->xx->xx |
| 884 // |
| 885 // We also want to detect bare-byte extra UTF-8 conversions: |
| 886 // zero one two pattern |
| 887 // ---- ------ ---------------- ----------------- |
| 888 // C3 C3 C383 C3->8x->C2->xx |
| 889 // D3 D3 C393 C3->9x->C2->xx->C2->xx |
| 890 // E3 E3 C3A3 C3->Ax->C2->xx->C2->xx->C2->xx |
| 891 // F3 F3 C3B3 C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx |
| 892 // |
| 893 |
| 894 /** |
| 895 CP1252 => UTF8 => UTF8UTF8 |
| 896 80 => E282AC => C3A2E2809AC2AC |
| 897 81 => C281 => C382C281 |
| 898 82 => E2809A => C3A2E282ACC5A1 |
| 899 83 => C692 => C386E28099 |
| 900 84 => E2809E => C3A2E282ACC5BE |
| 901 85 => E280A6 => C3A2E282ACC2A6 |
| 902 86 => E280A0 => C3A2E282ACC2A0 |
| 903 87 => E280A1 => C3A2E282ACC2A1 |
| 904 88 => CB86 => C38BE280A0 |
| 905 89 => E280B0 => C3A2E282ACC2B0 |
| 906 8A => C5A0 => C385C2A0 |
| 907 8B => E280B9 => C3A2E282ACC2B9 |
| 908 8C => C592 => C385E28099 |
| 909 8D => C28D => C382C28D |
| 910 8E => C5BD => C385C2BD |
| 911 8F => C28F => C382C28F |
| 912 90 => C290 => C382C290 |
| 913 91 => E28098 => C3A2E282ACCB9C |
| 914 92 => E28099 => C3A2E282ACE284A2 |
| 915 93 => E2809C => C3A2E282ACC593 |
| 916 94 => E2809D => C3A2E282ACC29D |
| 917 95 => E280A2 => C3A2E282ACC2A2 |
| 918 96 => E28093 => C3A2E282ACE2809C |
| 919 97 => E28094 => C3A2E282ACE2809D |
| 920 98 => CB9C => C38BC593 |
| 921 99 => E284A2 => C3A2E2809EC2A2 |
| 922 9A => C5A1 => C385C2A1 |
| 923 9B => E280BA => C3A2E282ACC2BA |
| 924 9C => C593 => C385E2809C |
| 925 9D => C29D => C382C29D |
| 926 9E => C5BE => C385C2BE |
| 927 9F => C5B8 => C385C2B8 |
| 928 A0 => C2A0 => C382C2A0 |
| 929 A1 => C2A1 => C382C2A1 |
| 930 A2 => C2A2 => C382C2A2 |
| 931 A3 => C2A3 => C382C2A3 |
| 932 A4 => C2A4 => C382C2A4 |
| 933 A5 => C2A5 => C382C2A5 |
| 934 A6 => C2A6 => C382C2A6 |
| 935 A7 => C2A7 => C382C2A7 |
| 936 A8 => C2A8 => C382C2A8 |
| 937 A9 => C2A9 => C382C2A9 |
| 938 AA => C2AA => C382C2AA |
| 939 AB => C2AB => C382C2AB |
| 940 AC => C2AC => C382C2AC |
| 941 AD => C2AD => C382C2AD |
| 942 AE => C2AE => C382C2AE |
| 943 AF => C2AF => C382C2AF |
| 944 B0 => C2B0 => C382C2B0 |
| 945 B1 => C2B1 => C382C2B1 |
| 946 B2 => C2B2 => C382C2B2 |
| 947 B3 => C2B3 => C382C2B3 |
| 948 B4 => C2B4 => C382C2B4 |
| 949 B5 => C2B5 => C382C2B5 |
| 950 B6 => C2B6 => C382C2B6 |
| 951 B7 => C2B7 => C382C2B7 |
| 952 B8 => C2B8 => C382C2B8 |
| 953 B9 => C2B9 => C382C2B9 |
| 954 BA => C2BA => C382C2BA |
| 955 BB => C2BB => C382C2BB |
| 956 BC => C2BC => C382C2BC |
| 957 BD => C2BD => C382C2BD |
| 958 BE => C2BE => C382C2BE |
| 959 BF => C2BF => C382C2BF |
| 960 C0 => C380 => C383E282AC |
| 961 C1 => C381 => C383C281 |
| 962 C2 => C382 => C383E2809A |
| 963 C3 => C383 => C383C692 |
| 964 C4 => C384 => C383E2809E |
| 965 C5 => C385 => C383E280A6 |
| 966 C6 => C386 => C383E280A0 |
| 967 C7 => C387 => C383E280A1 |
| 968 C8 => C388 => C383CB86 |
| 969 C9 => C389 => C383E280B0 |
| 970 CA => C38A => C383C5A0 |
| 971 CB => C38B => C383E280B9 |
| 972 CC => C38C => C383C592 |
| 973 CD => C38D => C383C28D |
| 974 CE => C38E => C383C5BD |
| 975 CF => C38F => C383C28F |
| 976 D0 => C390 => C383C290 |
| 977 D1 => C391 => C383E28098 |
| 978 D2 => C392 => C383E28099 |
| 979 D3 => C393 => C383E2809C |
| 980 D4 => C394 => C383E2809D |
| 981 D5 => C395 => C383E280A2 |
| 982 D6 => C396 => C383E28093 |
| 983 D7 => C397 => C383E28094 |
| 984 D8 => C398 => C383CB9C |
| 985 D9 => C399 => C383E284A2 |
| 986 DA => C39A => C383C5A1 |
| 987 DB => C39B => C383E280BA |
| 988 DC => C39C => C383C593 |
| 989 DD => C39D => C383C29D |
| 990 DE => C39E => C383C5BE |
| 991 DF => C39F => C383C5B8 |
| 992 E0 => C3A0 => C383C2A0 |
| 993 E1 => C3A1 => C383C2A1 |
| 994 E2 => C3A2 => C383C2A2 |
| 995 E3 => C3A3 => C383C2A3 |
| 996 E4 => C3A4 => C383C2A4 |
| 997 E5 => C3A5 => C383C2A5 |
| 998 E6 => C3A6 => C383C2A6 |
| 999 E7 => C3A7 => C383C2A7 |
| 1000 E8 => C3A8 => C383C2A8 |
| 1001 E9 => C3A9 => C383C2A9 |
| 1002 EA => C3AA => C383C2AA |
| 1003 EB => C3AB => C383C2AB |
| 1004 EC => C3AC => C383C2AC |
| 1005 ED => C3AD => C383C2AD |
| 1006 EE => C3AE => C383C2AE |
| 1007 EF => C3AF => C383C2AF |
| 1008 F0 => C3B0 => C383C2B0 |
| 1009 F1 => C3B1 => C383C2B1 |
| 1010 F2 => C3B2 => C383C2B2 |
| 1011 F3 => C3B3 => C383C2B3 |
| 1012 F4 => C3B4 => C383C2B4 |
| 1013 F5 => C3B5 => C383C2B5 |
| 1014 F6 => C3B6 => C383C2B6 |
| 1015 F7 => C3B7 => C383C2B7 |
| 1016 F8 => C3B8 => C383C2B8 |
| 1017 F9 => C3B9 => C383C2B9 |
| 1018 FA => C3BA => C383C2BA |
| 1019 FB => C3BB => C383C2BB |
| 1020 FC => C3BC => C383C2BC |
| 1021 FD => C3BD => C383C2BD |
| 1022 FE => C3BE => C383C2BE |
| 1023 FF => C3BF => C383C2BF |
| 1024 **/ |
| 1025 |
| 1026 // Subscripted by <state, f(byte1) + g(byte2)> |
| 1027 // where f(x)= E2->4, C2/5/6/B->8 and C3->12 and 0 otherwise |
| 1028 // and g(x) = (x >> 4) & 3 8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc. |
| 1029 |
| 1030 // 81 C281 C382C281 C3->8x->C2->xx |
| 1031 // 98 CB9C C38BC593 C3->8x->C5->xx |
| 1032 // C3 C383 C383C692 C3->8x->C6->xx |
| 1033 // C8 C388 C383CB86 C3->8x->CB->xx |
| 1034 // [0] [2] [0] |
| 1035 // 83 C692 C386E28099 C3->8x->E2->xx->xx |
| 1036 // odd_byte=0 [0] [2] [0+] odd_byte flipped |
| 1037 // odd_byte=1 [0+] [2] [0] [0] odd_byte unflipped |
| 1038 // 80 E282AC C3A2E2809AC2AC C3->A2->E2->xx->xx->Cx->xx |
| 1039 // odd_byte=0 [0] [3] [4] [0+] |
| 1040 // odd_byte=1 [0+] [3] [4] [4] [0] |
| 1041 // 92 E28099 C3A2E282ACE284A2 C3->A2->E2->xx->xx->E2->xx->xx |
| 1042 // odd_byte=0 [0] [3] [4] [0] [0] |
| 1043 // odd_byte=1 [0+] [3] [4] [4] [0+] |
| 1044 // |
| 1045 // When an E2xxxx sequence is encountered, we absorb the two bytes E2xx and flip |
| 1046 // the odd_byte state. If that goes from 0 to 1, the next pair is offset up |
| 1047 // by one byte, picking up the two bytes just after E2xxxx. If odd_byte goes |
| 1048 // from 1 to 0, the next two bytes picked up are the two bytes xxxx of E2xxxx. |
| 1049 // These are absorbed with no error in state 0 or state 4 |
| 1050 // |
| 1051 // C3 C3 C383 C3->8x->C2->xx |
| 1052 // D3 D3 C393 C3->9x->C2->xx->C2->xx |
| 1053 // E3 E3 C3A3 C3->Ax->C2->xx->C2->xx->C2->xx |
| 1054 // F3 F3 C3B3 C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx |
| 1055 // Counter3 for Fx Ex sequences is incremented at last C2 |
| 1056 |
// Next-state table for the UTF8UTF8 (doubly-converted UTF-8) scanner.
// Rows are the current state; columns are the byte-pair subscript that
// UTF88Sub() computes: 0-3 other pairs, 4-7 E2xx, 8-11 C2/C5/C6/CBxx,
// 12-15 C3xx, with the second byte's 8x/9x/Ax/Bx class selecting the
// column inside each group of four.
static const char kMiniUTF8UTF8State[8][16] = {
  // xxxx    E2xx     CXxx    C3xx
  //         8 9 a b  8 9 a b 8 9 a b
  {0,0,0,0,1,1,1,1, 1,1,1,1,2,2,3,5,}, // [0] looking for C38x/C3Ax/2020/8x8x, or err
  {0,0,0,0,1,1,1,1, 1,1,1,1,2,2,3,5,}, // [1] error, back to looking
  {1,1,1,1,0,0,0,0, 0,0,0,0,1,1,1,1,}, // [2] C38x looking for CXxx/E2xxxx
  //       + + + +                     // E2xxxx flips odd_byte
  {1,1,1,1,4,4,4,4, 7,7,7,7,1,1,1,1,}, // [3] C3Ax looking for E2xx or C2xxC2xx
  //       + + + +                     // E2xxxx flips odd_byte
  {4,4,4,4,0,0,0,0, 0,0,0,0,1,1,1,1,}, // [4] C3AxE2xx-- looking for C2xx/E2xxxx
  //       + + + +                     // E2xxxx flips odd_byte
  {1,1,1,1,1,1,1,1, 6,6,6,6,1,1,1,1,}, // [5] C3Bx -- looking for C2xxC2xxC2xx
  {1,1,1,1,1,1,1,1, 7,7,7,7,1,1,1,1,}, // [6] C3Bx -- looking for C2xxC2xx
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [7] C3Bx -- looking for C2xx
};
// Counter to increment: 0-don'tcare 1-error 2-good_2B 3-good_3B 4-good_4B
// Same <state, byte-pair subscript> layout as kMiniUTF8UTF8State.
static const char kMiniUTF8UTF8Count[8][16] = {
  // xxxx    E2xx     C2Xx    C3xx
  //         8 9 a b  8 9 a b 8 9 a b
  {0,0,0,0,1,1,1,1, 1,1,1,1,0,0,0,0,}, // [0] looking for C38x/C3Ax/2020/8x8x, or err
  {0,0,0,0,1,1,1,1, 1,1,1,1,0,0,0,0,}, // [1] error, back to looking
  {1,1,1,1,3,3,3,3, 2,2,2,2,1,1,1,1,}, // [2] C38x looking for CXxx/E2xxxx
  //       + + + +                     // E2xxxx flips odd_byte
  {1,1,1,1,0,0,0,0, 0,0,0,0,1,1,1,1,}, // [3] C3Ax looking for E2xx
  //       + + + +                     // E2xxxx flips odd_byte
  {1,1,1,1,4,4,4,4, 4,4,4,4,1,1,1,1,}, // [4] C3AxE2xx-- looking for C2xx/E2xxxx
  //       + + + +                     // E2xxxx flips odd_byte
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [5] C3Bx -- looking for C2xxC2xxC2xx
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [6] C3Bx -- looking for C2xxC2xx
  {1,1,1,1,1,1,1,1, 3,3,3,3,1,1,1,1,}, // [7] C3Bx -- looking for C2xx
};
| 1088 |
// Odd-byte flip table, same <state, byte-pair subscript> layout as
// kMiniUTF8UTF8State. The only non-zero entries are the E2xx columns of
// states [2], [3] and [4], i.e. the places marked "E2xxxx flips odd_byte".
static const char kMiniUTF8UTF8Odd[8][16] = {
  // xxxx    E2xx     C2Xx    C3xx
  //         8 9 a b  8 9 a b 8 9 a b
  {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [0] looking for C38x/C3Ax/2020/8x8x, or err
  {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [1] error, back to looking
  {0,0,0,0,1,1,1,1, 0,0,0,0,0,0,0,0,}, // [2] C38x looking for CXxx/E2xxxx
  //       + + + +                     // E2xxxx flips odd_byte
  {0,0,0,0,1,1,1,1, 0,0,0,0,0,0,0,0,}, // [3] C3Ax looking for E2xx
  //       + + + +                     // E2xxxx flips odd_byte
  {0,0,0,0,1,1,1,1, 0,0,0,0,0,0,0,0,}, // [4] C3AxE2xx-- looking for C2xx/E2xxxx
  //       + + + +                     // E2xxxx flips odd_byte
  {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [5] C3Bx -- looking for C2xxC2xxC2xx
  {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [6] C3Bx -- looking for C2xxC2xx
  {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [7] C3Bx -- looking for C2xx
};
| 1104 |
// Map a byte pair to the column subscript (0..15) of the UTF8UTF8 tables.
//   s0 selects the group of four: C3 -> 12, C2/C5/C6/CB -> 8, E2 -> 4,
//      any other byte -> 0.
//   s1 selects the column inside the group from bits 4-5 of the byte
//      (8x -> 0, 9x -> 1, Ax -> 2, Bx -> 3; no check for illegal bytes).
int UTF88Sub(char s0, char s1) {
  int group_base;
  switch (static_cast<unsigned char>(s0)) {
    case 0xc3:
      group_base = 12;
      break;
    case 0xc2:
    case 0xc5:
    case 0xc6:
    case 0xcb:
      group_base = 8;
      break;
    case 0xe2:
      group_base = 4;
      break;
    default:
      group_base = 0;
      break;
  }
  const int column_in_group = (s1 >> 4) & 0x03;
  return group_base + column_in_group;
}
| 1120 |
| 1121 |
| 1122 |
| 1123 |
| 1124 |
| 1125 // Default probability for each ranked encoding |
| 1126 // Based on a scan of 55M web pages |
| 1127 // These values are 255 + 10 * log base 2 (occurrences / total) |
| 1128 // Large values are most likely. This is the reverse of some Google code |
| 1129 // 255 = 1.0, 245 = 1/2, 235 = 1/4, 15 = 1/2**24, 0 = 0 (< 1/50M) |
| 1130 // |
| 1131 // TODO change this to be per encoding, not permuted |
| 1132 // |
| 1133 |
| 1134 |
| 1135 // Support function for unit test program |
| 1136 // Return ranked encoding corresponding to enc |
| 1137 // (also exported to compact_enc_det_text.cc) |
| 1138 int CompactEncDet::BackmapEncodingToRankedEncoding(Encoding enc) { |
| 1139 for (int i = 0; i < NUM_RANKEDENCODING; ++i) { |
| 1140 if (kMapToEncoding[i] == enc) { |
| 1141 return i; |
| 1142 } |
| 1143 } |
| 1144 return -1; |
| 1145 } |
| 1146 |
| 1147 |
| 1148 string DecodeActive(uint32 active) { |
| 1149 string temp(""); |
| 1150 if (active & kBinaryActive) { |
| 1151 temp.append("Binary "); |
| 1152 } |
| 1153 if (active & kUTF1632Active) { |
| 1154 temp.append("UTF1632 "); |
| 1155 } |
| 1156 if (active & kUTF8UTF8Active) { |
| 1157 temp.append("UTF8UTF8 "); |
| 1158 } |
| 1159 if (active & kUTF8Active) { |
| 1160 temp.append("UTF8 "); |
| 1161 } |
| 1162 if (active & kIso2022Active) { |
| 1163 temp.append("Iso2022 "); |
| 1164 } |
| 1165 if (active & kHzActive) { |
| 1166 temp.append("Hz "); |
| 1167 } |
| 1168 if (active & kUTF7Active) { |
| 1169 temp.append("UTF7A "); |
| 1170 } |
| 1171 if (active & kSevenBitActive) { |
| 1172 temp.append("SevenBit "); |
| 1173 } |
| 1174 if (active & kIsIndicCode) { |
| 1175 temp.append("Indic "); |
| 1176 } |
| 1177 if (active & kHighAlphaCode) { |
| 1178 temp.append("HighAlpha "); |
| 1179 } |
| 1180 if (active & kHighAccentCode) { |
| 1181 temp.append("HighAccent "); |
| 1182 } |
| 1183 if (active & kEUCJPActive) { |
| 1184 temp.append("EUCJP "); |
| 1185 } |
| 1186 return temp; |
| 1187 } |
| 1188 |
| 1189 static inline bool SevenBitEncoding(int enc) { |
| 1190 return ((kSpecialMask[enc] & kSevenBitActive) != 0); |
| 1191 } |
| 1192 static inline bool TwoByteEncoding(int enc) { |
| 1193 return ((kSpecialMask[enc] & kTwobyteCode) != 0); |
| 1194 } |
| 1195 static inline bool IndicEncoding(int enc) { |
| 1196 return ((kSpecialMask[enc] & kIsIndicCode) != 0); |
| 1197 } |
| 1198 static inline bool HighAlphaEncoding(int enc) { |
| 1199 return ((kSpecialMask[enc] & kHighAlphaCode) != 0); |
| 1200 } |
| 1201 static inline bool HighAccentEncoding(int enc) { |
| 1202 return ((kSpecialMask[enc] & kHighAccentCode) != 0); |
| 1203 } |
| 1204 |
| 1205 |
| 1206 static inline bool AnyActive(DetectEncodingState* destatep) { |
| 1207 return (destatep->active_special != 0); |
| 1208 } |
| 1209 static inline bool SevenBitActive(DetectEncodingState* destatep) { |
| 1210 return (destatep->active_special & kSevenBitActive) != 0; |
| 1211 } |
| 1212 |
| 1213 static inline bool UTF7Active(DetectEncodingState* destatep) { |
| 1214 return (destatep->active_special & kUTF7Active) != 0; |
| 1215 } |
| 1216 |
| 1217 static inline bool HzActive(DetectEncodingState* destatep) { |
| 1218 return (destatep->active_special & kHzActive) != 0; |
| 1219 } |
| 1220 static inline bool Iso2022Active(DetectEncodingState* destatep) { |
| 1221 return (destatep->active_special & kIso2022Active) != 0; |
| 1222 } |
| 1223 static inline bool UTF8Active(DetectEncodingState* destatep) { |
| 1224 return (destatep->active_special & kUTF8Active) != 0; |
| 1225 } |
| 1226 static inline bool UTF8UTF8Active(DetectEncodingState* destatep) { |
| 1227 return (destatep->active_special & kUTF8UTF8Active) != 0; |
| 1228 } |
| 1229 static inline bool UTF1632Active(DetectEncodingState* destatep) { |
| 1230 return (destatep->active_special & kUTF1632Active) != 0; |
| 1231 } |
| 1232 static inline bool BinaryActive(DetectEncodingState* destatep) { |
| 1233 return (destatep->active_special & kBinaryActive) != 0; |
| 1234 } |
| 1235 static inline bool UTF7OrHzActive(DetectEncodingState* destatep) { |
| 1236 return (destatep->active_special & (kHzActive + kUTF7Active)) != 0; |
| 1237 } |
| 1238 static inline bool EUCJPActive(DetectEncodingState* destatep) { |
| 1239 return ((destatep->active_special & kEUCJPActive) != 0); |
| 1240 } |
| 1241 static inline bool OtherActive(DetectEncodingState* destatep) { |
| 1242 return (destatep->active_special & (kIso2022Active + kBinaryActive + |
| 1243 kUTF8Active + kUTF8UTF8Active + |
| 1244 kUTF1632Active + kEUCJPActive)) != 0; |
| 1245 } |
| 1246 |
| 1247 |
| 1248 static inline bool CEDFlagRescanning(CEDInternalFlags flags) { |
| 1249 return (flags & kCEDRescanning) != 0; |
| 1250 } |
| 1251 |
| 1252 static inline bool CEDFlagSlowscore(CEDInternalFlags flags) { |
| 1253 return (flags & kCEDSlowscore) != 0; |
| 1254 } |
| 1255 |
| 1256 static inline bool CEDFlagForceTags(CEDInternalFlags flags) { |
| 1257 return (flags & kCEDForceTags) != 0; |
| 1258 } |
| 1259 |
| 1260 |
// Returns the larger of a and b (b when the two are equal).
static inline int maxint(int a, int b) {
  if (a > b) {
    return a;
  }
  return b;
}
// Returns the smaller of a and b (b when the two are equal).
static inline int minint(int a, int b) {
  if (a < b) {
    return a;
  }
  return b;
}
| 1263 |
| 1264 static inline const char* MyRankedEncName(int r_enc) { |
| 1265 return MyEncodingName(kMapToEncoding[r_enc]); |
| 1266 } |
| 1267 |
| 1268 |
// Only for debugging. not thread safe
// State shared by PsSourceInit / PsSourceFinish / PsSource / PsMark /
// PsHighlight.
static const int kPsSourceWidth = 32;
static int pssourcenext = 0;  // debug only. not threadsafe. dump only >= this
static int pssourcewidth = 0;  // debug only.
// Mark line printed before each source line: two characters per source
// byte plus 8 bytes of overscan; allocated in PsSourceInit.
static char* pssource_mark_buffer = NULL;
// Count of source lines dumped so far by PsSource.
int next_do_src_line;
// Source offsets of the most recent dumped lines, subscripted by
// line count mod 16.
int do_src_offset[16];
| 1276 |
| 1277 |
| 1278 void PsSourceInit(int len) { |
| 1279 pssourcenext = 0; |
| 1280 pssourcewidth = len; |
| 1281 delete[] pssource_mark_buffer; |
| 1282 // Allocate 2 Ascii characters per input byte |
| 1283 pssource_mark_buffer = new char[(pssourcewidth * 2) + 8]; // 8 = overscan |
| 1284 memset(pssource_mark_buffer, ' ', pssourcewidth * 2); |
| 1285 memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8); |
| 1286 |
| 1287 next_do_src_line = 0; |
| 1288 memset(do_src_offset, 0, sizeof(do_src_offset)); |
| 1289 } |
| 1290 |
| 1291 void PsSourceFinish() { |
| 1292 // Print preceding mark buffer |
| 1293 int j = (pssourcewidth * 2) - 1; |
| 1294 while ((0 <= j) && (pssource_mark_buffer[j] == ' ')) {--j;} // trim |
| 1295 pssource_mark_buffer[j + 1] = '\0'; |
| 1296 fprintf(stderr, "( %s) do-src\n", pssource_mark_buffer); |
| 1297 memset(pssource_mark_buffer, ' ', pssourcewidth * 2); |
| 1298 memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8); |
| 1299 |
| 1300 delete[] pssource_mark_buffer; |
| 1301 pssource_mark_buffer = NULL; |
| 1302 } |
| 1303 |
| 1304 // Dump aligned len bytes src... if not already dumped |
| 1305 void PsSource(const uint8* src, const uint8* isrc, const uint8* srclimit) { |
| 1306 int offset = src - isrc; |
| 1307 offset -= (offset % pssourcewidth); // round down to multiple of len bytes |
| 1308 if (offset < pssourcenext) { |
| 1309 return; |
| 1310 } |
| 1311 pssourcenext = offset + pssourcewidth; // Min offset for next dump |
| 1312 |
| 1313 // Print preceding mark buffer |
| 1314 int j = (pssourcewidth * 2) - 1; |
| 1315 while ((0 <= j) && (pssource_mark_buffer[j] == ' ')) {--j;} // trim |
| 1316 pssource_mark_buffer[j + 1] = '\0'; |
| 1317 fprintf(stderr, "( %s) do-src\n", pssource_mark_buffer); |
| 1318 memset(pssource_mark_buffer, ' ', pssourcewidth * 2); |
| 1319 memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8); |
| 1320 |
| 1321 // Print source bytes |
| 1322 const uint8* src_aligned = isrc + offset; |
| 1323 int length = srclimit - src_aligned; |
| 1324 length = minint(pssourcewidth, length); |
| 1325 |
| 1326 fprintf(stderr, "(%05x ", offset); |
| 1327 for (int i = 0; i < length; ++i) { |
| 1328 char c = src_aligned[i]; |
| 1329 if (c == '\n') {c = ' ';} |
| 1330 if (c == '\r') {c = ' ';} |
| 1331 if (c == '\t') {c = ' ';} |
| 1332 if (c == '(') { |
| 1333 fprintf(stderr, "%s", "\\( "); |
| 1334 } else if (c == ')') { |
| 1335 fprintf(stderr, "%s", "\\) "); |
| 1336 } else if (c == '\\') { |
| 1337 fprintf(stderr, "%s", "\\\\ "); |
| 1338 } else if ((0x20 <= c) && (c <= 0x7e)) { |
| 1339 fprintf(stderr, "%c ", c); |
| 1340 } else { |
| 1341 fprintf(stderr, "%02x", c); |
| 1342 } |
| 1343 } |
| 1344 fprintf(stderr, ") do-src\n"); |
| 1345 // Remember which source offsets are where, mod 16 |
| 1346 do_src_offset[next_do_src_line & 0x0f] = offset; |
| 1347 ++next_do_src_line; |
| 1348 } |
| 1349 |
| 1350 // Mark bytes in just-previous source bytes |
| 1351 void PsMark(const uint8* src, int len, const uint8* isrc, int weightshift) { |
| 1352 int offset = src - isrc; |
| 1353 offset = (offset % pssourcewidth); // mod len bytes |
| 1354 char mark = (weightshift == 0) ? '-' : 'x'; |
| 1355 |
| 1356 pssource_mark_buffer[(offset * 2)] = '='; |
| 1357 pssource_mark_buffer[(offset * 2) + 1] = '='; |
| 1358 for (int i = 1; i < len; ++i) { |
| 1359 pssource_mark_buffer[(offset + i) * 2] = mark; |
| 1360 pssource_mark_buffer[((offset + i) * 2) + 1] = mark; |
| 1361 } |
| 1362 } |
| 1363 |
| 1364 |
| 1365 // Highlight trigram bytes in just-previous source bytes |
| 1366 // Unfortunately, we have to skip back N lines since source was printed for |
| 1367 // up to 8 bigrams before we get here. Match on src+1 to handle 0/31 better |
| 1368 void PsHighlight(const uint8* src, const uint8* isrc, int trigram_val, int n) { |
| 1369 int offset = (src + 1) - isrc; |
| 1370 int offset32 = (offset % pssourcewidth); // mod len bytes |
| 1371 offset -= offset32; // round down to multiple of len bytes |
| 1372 |
| 1373 for (int i = 1; i <= 16; ++i) { |
| 1374 if (do_src_offset[(next_do_src_line - i) & 0x0f] == offset) { |
| 1375 fprintf(stderr, "%d %d %d do-highlight%d\n", |
| 1376 i, offset32 - 1, trigram_val, n); |
| 1377 break; |
| 1378 } |
| 1379 } |
| 1380 } |
| 1381 |
| 1382 |
| 1383 void InitDetectEncodingState(DetectEncodingState* destatep) { |
| 1384 destatep->initial_src = NULL; // Filled in by caller |
| 1385 destatep->limit_src = NULL; |
| 1386 destatep->prior_src = NULL; |
| 1387 destatep->last_pair = NULL; |
| 1388 |
| 1389 destatep->debug_data = NULL; |
| 1390 destatep->next_detail_entry = 0; |
| 1391 |
| 1392 destatep->done = false; |
| 1393 destatep->reliable = false; |
| 1394 destatep->hints_derated = false; |
| 1395 //destatep->declared_enc_1 init in ApplyHints |
| 1396 //destatep->declared_enc_2 init in ApplyHints |
| 1397 destatep->prune_count = 0; |
| 1398 |
| 1399 destatep->trigram_highwater_mark = 0; |
| 1400 destatep->looking_for_latin_trigrams = false; |
| 1401 destatep->do_latin_trigrams = false; |
| 1402 |
| 1403 // Miscellaneous state variables for difficult encodings |
| 1404 destatep->binary_quadrants_count = 0; |
| 1405 destatep->binary_8x4_count = 0; |
| 1406 destatep->binary_quadrants_seen = 0; |
| 1407 destatep->binary_8x4_seen = 0; |
| 1408 destatep->utf7_starts = 0; |
| 1409 destatep->prior_utf7_offset = 0; |
| 1410 destatep->next_utf8_ministate = 0; |
| 1411 for (int i = 0; i < 6; i++) {destatep->utf8_minicount[i] = 0;} |
| 1412 destatep->next_utf8utf8_ministate = 0; |
| 1413 destatep->utf8utf8_odd_byte = 0; |
| 1414 for (int i = 0; i < 6; i++) {destatep->utf8utf8_minicount[i] = 0;} |
| 1415 destatep->next_2022_state = SOSI_NONE; |
| 1416 destatep->next_hz_state = SOSI_NONE; |
| 1417 destatep->next_eucjp_oddphase = false; |
| 1418 for (int i = 0; i < 8; i++) {destatep->byte32_count[i] = 0;} |
| 1419 destatep->active_special = 0xffffffff; |
| 1420 destatep->tld_hint = UNKNOWN_ENCODING; |
| 1421 destatep->http_hint = UNKNOWN_ENCODING; |
| 1422 destatep->meta_hint = UNKNOWN_ENCODING; |
| 1423 destatep->bom_hint = UNKNOWN_ENCODING; |
| 1424 destatep->top_rankedencoding = 0; // ASCII [seven-bit] is the default |
| 1425 destatep->second_top_rankedencoding = 0; // ASCII [seven-bit] is the default |
| 1426 destatep->top_prob = -1; |
| 1427 destatep->second_top_prob = -1; |
| 1428 // This is wide for first pruning, shrinks for 2nd and later |
| 1429 destatep->prune_difference = kInititalPruneDifference; |
| 1430 |
| 1431 destatep->next_prior_bigram = 0; |
| 1432 destatep->prior_bigram[0] = -1; |
| 1433 destatep->prior_bigram[1] = -1; |
| 1434 destatep->prior_bigram[2] = -1; |
| 1435 destatep->prior_bigram[3] = -1; |
| 1436 |
| 1437 destatep->prior_binary[0] = -1; |
| 1438 |
| 1439 // Initialize with all but Indic encodings, which we never detect |
| 1440 int k = 0; |
| 1441 for (int rankedencoding = 0; |
| 1442 rankedencoding < NUM_RANKEDENCODING; |
| 1443 rankedencoding++) { |
| 1444 Encoding enc = kMapToEncoding[rankedencoding]; |
| 1445 if (!IndicEncoding(enc)) { |
| 1446 destatep->rankedencoding_list[k++] = rankedencoding; |
| 1447 } |
| 1448 } |
| 1449 destatep->rankedencoding_list_len = k; |
| 1450 |
| 1451 // This is where all the action is |
| 1452 memset(destatep->enc_prob, 0, sizeof(destatep->enc_prob)); |
| 1453 |
| 1454 memset(destatep->hint_prob, 0, sizeof(destatep->hint_prob)); |
| 1455 memset(destatep->hint_weight, 0, sizeof(destatep->hint_weight)); |
| 1456 |
| 1457 destatep->prior_interesting_pair[AsciiPair] = 0; |
| 1458 destatep->prior_interesting_pair[OtherPair] = 0; |
| 1459 destatep->next_interesting_pair[AsciiPair] = 0; |
| 1460 destatep->next_interesting_pair[OtherPair] = 0; |
| 1461 // interesting_pairs/offsets/weightshifts not initialized; no need |
| 1462 } |
| 1463 |
| 1464 // Probability strings are uint8, with zeros removed via simple run-length: |
| 1465 // (<skip-take byte> <data bytes>)* |
| 1466 // skip-take: |
| 1467 // 00 end |
| 1468 // x0 skip 16 x locations, take 0 data values |
| 1469 // xy skip x locations, take y data values |
| 1470 // Multiply all the incoming values by 3 to account for 3x unigram sums |
| 1471 // |
| 1472 // {{0x77,0x69,0x6e,0x64,0x31,0x32,0x35,0x35, |
| 1473 // 0x01,0xc2,0x10,0x41,0xfe,0x71,0xba,0x00,}}, // "wind1255" |
| 1474 // |
| 1475 // Weight is 0..100 percent |
| 1476 // |
| 1477 // Returns subscript of largest (most probable) value |
| 1478 // |
| 1479 |
| 1480 |
| 1481 // {{0x6e,0x6c,0x5f,0x5f, 0x05,0xb2,0xae,0xa0,0x32,0xa1,0x36,0x31,0x42,0x39,0x3
b,0x33,0x45,0x11,0x6f,0x00,}}, // "nl__" |
| 1482 // // ASCII-7-bit=178 Latin1=174 UTF8=160 GB=50 CP1252=161 BIG5=49
Latin2=66 CP1251=57 CP1256=59 CP1250=51 Latin5=69 ISO-8859-15=111 [top ASC
II-7-bit] |
| 1483 int ApplyCompressedProb(const char* iprob, int len, |
| 1484 int weight, DetectEncodingState* destatep) { |
| 1485 int* dst = &destatep->enc_prob[0]; |
| 1486 int* dst2 = &destatep->hint_weight[0]; |
| 1487 const uint8* prob = reinterpret_cast<const uint8*>(iprob); |
| 1488 const uint8* problimit = prob + len; |
| 1489 |
| 1490 int largest = -1; |
| 1491 int subscript_of_largest = 0; |
| 1492 |
| 1493 // Continue with first byte and subsequent ones |
| 1494 while (prob < problimit) { |
| 1495 int skiptake = *prob++; |
| 1496 int skip = (skiptake & 0xf0) >> 4; |
| 1497 int take = skiptake & 0x0f; |
| 1498 if (skiptake == 00) { |
| 1499 break; |
| 1500 } else if (take == 0) { |
| 1501 dst += (skip << 4); |
| 1502 dst2 += (skip << 4); |
| 1503 } else { |
| 1504 dst += skip; // Normal case |
| 1505 dst2 += skip; // Normal case |
| 1506 for (int i = 0; i < take; i++) { |
| 1507 int enc = static_cast<int>(dst - &destatep->enc_prob[0]) + i; |
| 1508 if (largest < prob[i]) { |
| 1509 largest = prob[i]; |
| 1510 subscript_of_largest = enc; |
| 1511 } |
| 1512 |
| 1513 int increment = prob[i] * 3; // The actual increment |
| 1514 |
| 1515 // Do maximum of previous hints plus this new one |
| 1516 if (weight > 0) { |
| 1517 increment = (increment * weight) / 100; |
| 1518 dst[i] = maxint(dst[i], increment); |
| 1519 dst2[i] = 1; // New total weight |
| 1520 } |
| 1521 } |
| 1522 prob += take; |
| 1523 dst += take; |
| 1524 dst2 += take; |
| 1525 } |
| 1526 } |
| 1527 return subscript_of_largest; |
| 1528 } |
| 1529 |
| 1530 |
// Returns subscript of largest (most probable) value [for unit test]
//
// iprob, len: compressed probability vector in skip-take form (high nibble
//             of each control byte = skip, low nibble = take, 00 = end) and
//             the number of bytes readable from it.
// Returns 0 when the vector holds no data value greater than zero.
//
// A run whose take count reaches past len is cut off at len, so no byte
// beyond the caller-supplied length is examined.
int TopCompressedProb(const char* iprob, int len) {
  // Bytes are read as unsigned 8-bit values
  const unsigned char* prob = reinterpret_cast<const unsigned char*>(iprob);
  const unsigned char* problimit = prob + len;
  int next_prob_sub = 0;
  int topprob = 0;
  int toprankenc = 0;

  while (prob < problimit) {
    int skiptake = *prob++;
    int skip = (skiptake & 0xf0) >> 4;
    int take = skiptake & 0x0f;
    if (skiptake == 0) {
      break;                              // End marker
    } else if (take == 0) {
      next_prob_sub += (skip << 4);       // Long skip, no data
    } else {
      next_prob_sub += skip;              // Normal case

      // Honor len: do not look at data bytes the caller did not supply
      int avail = static_cast<int>(problimit - prob);
      if (take > avail) {take = avail;}

      for (int i = 0; i < take; i++) {
        if (topprob < prob[i]) {
          topprob = prob[i];
          toprankenc = next_prob_sub + i;
        }
      }
      prob += take;
      next_prob_sub += take;
    }
  }
  return toprankenc;
}
| 1561 |
| 1562 |
| 1563 // Find subscript of matching key in first 8 bytes of sorted hint array, or -1 |
| 1564 int HintBinaryLookup8(const HintEntry* hintprobs, int hintprobssize, |
| 1565 const char* norm_key) { |
| 1566 // Key is always in range [lo..hi) |
| 1567 int lo = 0; |
| 1568 int hi = hintprobssize; |
| 1569 while (lo < hi) { |
| 1570 int mid = (lo + hi) >> 1; |
| 1571 int comp = memcmp(&hintprobs[mid].key_prob[0], norm_key, 8); |
| 1572 if (comp < 0) { |
| 1573 lo = mid + 1; |
| 1574 } else if (comp > 0) { |
| 1575 hi = mid; |
| 1576 } else { |
| 1577 return mid; |
| 1578 } |
| 1579 } |
| 1580 return -1; |
| 1581 } |
| 1582 |
| 1583 // Find subscript of matching key in first 4 bytes of sorted hint array, or -1 |
| 1584 int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize, |
| 1585 const char* norm_key) { |
| 1586 // Key is always in range [lo..hi) |
| 1587 int lo = 0; |
| 1588 int hi = hintprobssize; |
| 1589 while (lo < hi) { |
| 1590 int mid = (lo + hi) >> 1; |
| 1591 int comp = memcmp(&hintprobs[mid].key_prob[0], norm_key, 4); |
| 1592 if (comp < 0) { |
| 1593 lo = mid + 1; |
| 1594 } else if (comp > 0) { |
| 1595 hi = mid; |
| 1596 } else { |
| 1597 return mid; |
| 1598 } |
| 1599 } |
| 1600 return -1; |
| 1601 } |
| 1602 |
| 1603 static inline void Boost(DetectEncodingState* destatep, int r_enc, int boost) { |
| 1604 destatep->enc_prob[r_enc] += boost; |
| 1605 } |
| 1606 |
| 1607 static inline void Whack(DetectEncodingState* destatep, int r_enc, int whack) { |
| 1608 destatep->enc_prob[r_enc] -= whack; |
| 1609 } |
| 1610 |
| 1611 // Apply initial probability hint based on top level domain name |
| 1612 // Weight is 0..100 percent |
| 1613 // Return 1 if name match found |
| 1614 int ApplyTldHint(const char* url_tld_hint, int weight, |
| 1615 DetectEncodingState* destatep) { |
| 1616 if (url_tld_hint[0] == '~') { |
| 1617 return 0; |
| 1618 } |
| 1619 string normalized_tld = MakeChar4(string(url_tld_hint)); |
| 1620 int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize, |
| 1621 normalized_tld.c_str()); |
| 1622 if (n >= 0) { |
| 1623 // TLD is four bytes, probability table is ~12 bytes |
| 1624 int best_sub = ApplyCompressedProb(&kTLDHintProbs[n].key_prob[kMaxTldKey], |
| 1625 kMaxTldVector, weight, destatep); |
| 1626 // Never boost ASCII7; do CP1252 instead |
| 1627 if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;} |
| 1628 destatep->declared_enc_1 = best_sub; |
| 1629 if (destatep->debug_data != NULL) { |
| 1630 // Show TLD hint |
| 1631 SetDetailsEncProb(destatep, 0, best_sub, url_tld_hint); |
| 1632 } |
| 1633 return 1; |
| 1634 } |
| 1635 return 0; |
| 1636 } |
| 1637 |
| 1638 // Apply initial probability hint based on charset= name |
| 1639 // Weight is 0..100 percent |
| 1640 // Return 1 if name match found |
| 1641 int ApplyCharsetHint(const char* charset_hint, int weight, |
| 1642 DetectEncodingState* destatep) { |
| 1643 if (charset_hint[0] == '~') { |
| 1644 return 0; |
| 1645 } |
| 1646 string normalized_charset = MakeChar44(string(charset_hint)); |
| 1647 int n = HintBinaryLookup8(kCharsetHintProbs, kCharsetHintProbsSize, |
| 1648 normalized_charset.c_str()); |
| 1649 if (n >= 0) { |
| 1650 // Charset is eight bytes, probability table is ~eight bytes |
| 1651 int best_sub = ApplyCompressedProb(&kCharsetHintProbs[n].key_prob[kMaxCharse
tKey], |
| 1652 kMaxCharsetVector, weight, destatep); |
| 1653 // Never boost ASCII7; do CP1252 instead |
| 1654 if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;} |
| 1655 destatep->declared_enc_1 = best_sub; |
| 1656 |
| 1657 // If first explicitly declared charset is confusable with Latin1/1252, put |
| 1658 // both declared forms in declared_enc_*, displacing Latin1/1252. |
| 1659 // This avoids a bit of Latin1 creep. |
| 1660 // Also boost the declared encoding and its pair |
| 1661 // TODO (dsites) This should all be folded into postproc-enc-detect.cc |
| 1662 if ((destatep->http_hint == UNKNOWN_ENCODING) && |
| 1663 (destatep->meta_hint == UNKNOWN_ENCODING)) { |
| 1664 // This is the first charset=hint |
| 1665 switch (best_sub) { |
| 1666 case F_Latin2: // 8859-2 Latin2, east euro |
| 1667 destatep->declared_enc_2 = F_CP1250; |
| 1668 Boost(destatep, F_Latin2, kGentleOnePair); |
| 1669 Boost(destatep, F_CP1250, kGentleOnePair); |
| 1670 break; |
| 1671 case F_CP1250: |
| 1672 destatep->declared_enc_2 = F_Latin2; |
| 1673 Boost(destatep, F_Latin2, kGentleOnePair); |
| 1674 Boost(destatep, F_CP1250, kGentleOnePair); |
| 1675 break; |
| 1676 |
| 1677 case F_Latin3: // 8859-3 Latin3, south euro, Esperanto |
| 1678 destatep->declared_enc_2 = F_ASCII_7_bit; |
| 1679 Boost(destatep, F_Latin3, kGentleOnePair); |
| 1680 break; |
| 1681 |
| 1682 case F_Latin4: // 8859-4 Latin4, north euro |
| 1683 destatep->declared_enc_2 = F_ASCII_7_bit; |
| 1684 Boost(destatep, F_Latin4, kGentleOnePair); |
| 1685 break; |
| 1686 |
| 1687 case F_ISO_8859_5: // 8859-5 Cyrillic |
| 1688 destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost 1251 |
| 1689 Boost(destatep, F_ISO_8859_5, kGentleOnePair); // (too different) |
| 1690 break; |
| 1691 case F_CP1251: |
| 1692 destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost -5 |
| 1693 Boost(destatep, F_CP1251, kGentleOnePair); // (too different) |
| 1694 break; |
| 1695 |
| 1696 case F_Arabic: // 8859-6 Arabic |
| 1697 destatep->declared_enc_2 = F_CP1256; |
| 1698 Boost(destatep, F_Arabic, kGentleOnePair); |
| 1699 Boost(destatep, F_CP1256, kGentleOnePair); |
| 1700 break; |
| 1701 case F_CP1256: |
| 1702 destatep->declared_enc_2 = F_Arabic; |
| 1703 Boost(destatep, F_Arabic, kGentleOnePair); |
| 1704 Boost(destatep, F_CP1256, kGentleOnePair); |
| 1705 break; |
| 1706 |
| 1707 case F_Greek: // 8859-7 Greek |
| 1708 destatep->declared_enc_2 = F_CP1253; |
| 1709 Boost(destatep, F_Greek, kGentleOnePair); |
| 1710 Boost(destatep, F_CP1253, kGentleOnePair); |
| 1711 break; |
| 1712 case F_CP1253: |
| 1713 destatep->declared_enc_2 = F_Greek; |
| 1714 Boost(destatep, F_Greek, kGentleOnePair); |
| 1715 Boost(destatep, F_CP1253, kGentleOnePair); |
| 1716 break; |
| 1717 |
| 1718 case F_Hebrew: // 8859-8 Hebrew |
| 1719 destatep->declared_enc_2 = F_CP1255; |
| 1720 Boost(destatep, F_Hebrew, kGentleOnePair); |
| 1721 Boost(destatep, F_CP1255, kGentleOnePair); |
| 1722 break; |
| 1723 case F_CP1255: |
| 1724 destatep->declared_enc_2 = F_Hebrew; |
| 1725 Boost(destatep, F_Hebrew, kGentleOnePair); |
| 1726 Boost(destatep, F_CP1255, kGentleOnePair); |
| 1727 break; |
| 1728 |
| 1729 case F_Latin5: // 8859-9 Latin5, Turkish |
| 1730 destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost 1254 |
| 1731 Boost(destatep, F_Latin5, kGentleOnePair); // (too different) |
| 1732 break; |
| 1733 case F_CP1254: |
| 1734 destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost Latin5 |
| 1735 Boost(destatep, F_CP1254, kGentleOnePair); // (too different) |
| 1736 break; |
| 1737 |
| 1738 case F_Latin6: // 8859-10 Latin6, Nordic |
| 1739 destatep->declared_enc_2 = F_ASCII_7_bit; |
| 1740 Boost(destatep, F_Latin6, kGentleOnePair); |
| 1741 break; |
| 1742 |
| 1743 case F_ISO_8859_11: // 8859-11 Thai, |
| 1744 destatep->declared_enc_2 = F_CP874; |
| 1745 Boost(destatep, F_ISO_8859_11, kGentleOnePair); |
| 1746 Boost(destatep, F_CP874, kGentleOnePair); |
| 1747 break; |
| 1748 case F_CP874: |
| 1749 destatep->declared_enc_2 = F_ISO_8859_11; |
| 1750 Boost(destatep, F_ISO_8859_11, kGentleOnePair); |
| 1751 Boost(destatep, F_CP874, kGentleOnePair); |
| 1752 break; |
| 1753 |
| 1754 case F_ISO_8859_13: // 8859-13 Latin7, Baltic |
| 1755 destatep->declared_enc_2 = F_CP1257; |
| 1756 Boost(destatep, F_ISO_8859_13, kGentleOnePair); |
| 1757 Boost(destatep, F_CP1257, kGentleOnePair); |
| 1758 break; |
| 1759 case F_CP1257: |
| 1760 destatep->declared_enc_2 = F_ISO_8859_13; |
| 1761 Boost(destatep, F_ISO_8859_13, kGentleOnePair); |
| 1762 Boost(destatep, F_CP1257, kGentleOnePair); |
| 1763 break; |
| 1764 |
| 1765 case F_ISO_8859_15: // 8859-15 Latin9, Latin0, Euro-ized Latin1 |
| 1766 destatep->declared_enc_2 = F_ASCII_7_bit; |
| 1767 Boost(destatep, F_ISO_8859_15, kGentleOnePair); |
| 1768 break; |
| 1769 |
| 1770 |
| 1771 // Greek all-caps is confusable with KOI8x all-lower and Hebrew. |
| 1772 // This turns some Greek documents into Cyrillic, etc. by mistake. |
| 1773 // Greek and Hebrew are boosted explicitly above; do KOI8x here. |
| 1774 // Boosting the declared encodingmakes it harder for the wrong one to |
| 1775 // creep up. |
| 1776 case F_KOI8R: |
| 1777 Boost(destatep, F_KOI8R, kGentleOnePair); |
| 1778 break; |
| 1779 case F_KOI8U: |
| 1780 Boost(destatep, F_KOI8U, kGentleOnePair); |
| 1781 break; |
| 1782 |
| 1783 default: |
| 1784 break; |
| 1785 } |
| 1786 } |
| 1787 |
| 1788 if (destatep->debug_data != NULL) { |
| 1789 // Show charset hint |
| 1790 SetDetailsEncProb(destatep, 0, best_sub, charset_hint); |
| 1791 } |
| 1792 |
| 1793 // |
| 1794 // Some fix-ups for the declared encodings |
| 1795 // |
| 1796 |
| 1797 // If non-UTF8, non-Latin1/1252 encoding declared, disable UTF8 combos |
| 1798 // TODO (dsites) This should all be folded into postproc-enc-detect.cc |
| 1799 if ((best_sub != F_UTF8) && |
| 1800 (best_sub != F_Latin1) && |
| 1801 (best_sub != F_CP1252)) { |
| 1802 Whack(destatep, F_UTF8UTF8, kBadPairWhack * 4); // demote |
| 1803 } |
| 1804 |
| 1805 // Latin2 and CP1250 differ in the overlap part, such as B1 or B9 |
| 1806 // The initial probabilites for charset=Latin2 explicitly put CP1250 |
| 1807 // down twice as far as normal, and vice versa. This is done in |
| 1808 // postproc-enc-detect.cc |
| 1809 |
| 1810 // If charset=user-defined, treat as Binary -- |
| 1811 // we can safely only do low ASCII, might be Indic |
| 1812 if (normalized_charset.substr(0,4) == "user") { |
| 1813 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 1814 } |
| 1815 |
| 1816 return 1; |
| 1817 } |
| 1818 return 0; |
| 1819 } |
| 1820 |
| 1821 // Apply initial probability hint based on caller-supplied encoding |
| 1822 // Negative hint whacks ~encoding, non-negative boosts encoding |
| 1823 // |
| 1824 // Negative hints are an experiment to see if they might be useful. |
| 1825 // Not operator used instead of unary minus to allow specifying not-zero |
| 1826 int ApplyEncodingHint(const int encoding_hint, int weight, |
| 1827 DetectEncodingState* destatep) { |
| 1828 Encoding enc_hint = static_cast<Encoding>((encoding_hint < 0) ? |
| 1829 ~encoding_hint : encoding_hint); |
| 1830 // Map to the right internal subscript |
| 1831 int rankedenc_hint = CompactEncDet::BackmapEncodingToRankedEncoding(enc_hint); |
| 1832 |
| 1833 // I'm not sure how strong this hint should be. Weight 100% = 1 bigram |
| 1834 int increment = (kBoostOnePair * weight) / 100; |
| 1835 |
| 1836 if (encoding_hint < 0) { |
| 1837 destatep->enc_prob[rankedenc_hint] -= increment; |
| 1838 } else { |
| 1839 destatep->enc_prob[rankedenc_hint] += increment; |
| 1840 } |
| 1841 |
| 1842 if (destatep->debug_data != NULL) { |
| 1843 // Show encoding hint |
| 1844 SetDetailsEncProb(destatep, 0, -1, MyEncodingName(enc_hint)); |
| 1845 } |
| 1846 return 1; |
| 1847 } |
| 1848 |
| 1849 // Apply initial probability hint based on user interface language |
| 1850 // Weight is 0..100 percent |
| 1851 // Return 1 if name match found |
| 1852 int ApplyUILangaugeHint(const Language language_hint, |
| 1853 int weight, DetectEncodingState* destatep) { |
| 1854 if (language_hint == UNKNOWN_LANGUAGE) { |
| 1855 return 0; |
| 1856 } |
| 1857 string normalized_lang = MakeChar8(LanguageName(language_hint)); |
| 1858 int n = HintBinaryLookup8(kLangHintProbs, kLangHintProbsSize, |
| 1859 normalized_lang.c_str()); |
| 1860 if (n >= 0) { |
| 1861 // Language is eight bytes, probability table is ~eight bytes |
| 1862 int best_sub = ApplyCompressedProb(&kLangHintProbs[n].key_prob[kMaxLangKey], |
| 1863 kMaxLangVector, weight, destatep); |
| 1864 // Never boost ASCII7; do CP1252 instead |
| 1865 if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;} |
| 1866 destatep->declared_enc_1 = best_sub; |
| 1867 if (destatep->debug_data != NULL) { |
| 1868 // Show language hint |
| 1869 SetDetailsEncProb(destatep, 0, best_sub, normalized_lang.c_str()); |
| 1870 } |
| 1871 return 1; |
| 1872 } |
| 1873 return 0; |
| 1874 } |
| 1875 |
| 1876 // Apply initial probability hint based on corpus type (web, email, etc) |
| 1877 // Weight is 0..100 percent IGNORED |
| 1878 // Return 1 if name match found |
| 1879 int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type, |
| 1880 int weight, DetectEncodingState* destatep) { |
| 1881 |
| 1882 for (int i = 0; i < NUM_RANKEDENCODING; i++) { |
| 1883 // Set the default probability |
| 1884 destatep->enc_prob[i] = kDefaultProb[i] * 3; |
| 1885 // Deliberately set 2022 seven-bit encodings to zero, |
| 1886 // so we can look for actual use |
| 1887 // TODO (dsites) This should all be folded into postproc-enc-detect.cc |
| 1888 if (SevenBitEncoding(kMapToEncoding[i])) { |
| 1889 destatep->enc_prob[i] = 0; |
| 1890 } |
| 1891 } |
| 1892 |
| 1893 // A little corpus distinction |
| 1894 switch (corpus_type) { |
| 1895 case CompactEncDet::WEB_CORPUS: |
| 1896 case CompactEncDet::XML_CORPUS: |
| 1897 // Allow double-converted UTF-8 to start nearly equal to normal UTF-8 |
| 1898 destatep->enc_prob[F_UTF8UTF8] = |
| 1899 destatep->enc_prob[F_UTF8] - kSmallInitDiff; |
| 1900 break; |
| 1901 case CompactEncDet::QUERY_CORPUS: |
| 1902 case CompactEncDet::EMAIL_CORPUS: |
| 1903 default: |
| 1904 break; |
| 1905 } |
| 1906 |
| 1907 if (FLAGS_demo_nodefault) { |
| 1908 // Demo, make initial probs all zero |
| 1909 for (int i = 0; i < NUM_RANKEDENCODING; i++) { |
| 1910 destatep->enc_prob[i] = 0; |
| 1911 } |
| 1912 } |
| 1913 |
| 1914 if (destatep->debug_data != NULL) { |
| 1915 // Show default hint |
| 1916 SetDetailsEncProb(destatep, 0, -1, "Default"); |
| 1917 } |
| 1918 return 1; |
| 1919 } |
| 1920 |
| 1921 |
| 1922 |
// Do reverse search for c in [str..str+len)
// Note: initial pointer is to FRONT of string, not back
// Returns a pointer to the last occurrence of c, or NULL if c does not occur
// (always NULL when len is 0).
const char* MyMemrchr(const char* str, char c, size_t len) {
  // Count down with an index rather than a pointer so that we never form or
  // compare the out-of-range pointer str - 1 when c is absent.
  for (size_t remaining = len; remaining > 0; --remaining) {
    const char* candidate = str + (remaining - 1);
    if (*candidate == c) {return candidate;}
  }
  return NULL;
}
| 1932 |
| 1933 |
// The shortest real URL is 11 bytes, "http://a.bc"; anything shorter is
// assumed to be a bare TLD.
// Now that we are no longer trying to do Indic font-based encodings, we
// don't need the full URL and can go back to simple TLD. This test remains
// for backward compatibility with any caller using full URL.
static const int kMinURLLength = sizeof("http://a.bc") - 1;   // == 11
| 1939 |
| 1940 // Extract TLD from a full URL or just a TLD |
| 1941 // Return hostname and length if a full URL |
| 1942 void ExtractTLD(const char* url_hint, char* tld_hint, int tld_hint_len, |
| 1943 const char** ret_host_start, int* ret_host_len) { |
| 1944 // url_hint can either be a full URL (preferred) or just top-level domain name |
| 1945 // Extract the TLD from a full URL and use it for |
| 1946 // a normal TLD hint |
| 1947 |
| 1948 strncpy(tld_hint, "~", tld_hint_len); |
| 1949 tld_hint[tld_hint_len - 1] = '\0'; |
| 1950 *ret_host_start = NULL; |
| 1951 *ret_host_len = 0; |
| 1952 |
| 1953 int url_len = (url_hint != NULL) ? strlen(url_hint) : 0; |
| 1954 if (url_len == 0) { |
| 1955 // Empty TLD |
| 1956 return; |
| 1957 } |
| 1958 |
| 1959 // Minimum real URL is 11 bytes: "http://a.bc" -- shorter is assumed to be TLD |
| 1960 if (kMinURLLength <= url_len) { |
| 1961 // See if it really is a URL |
| 1962 const char* first_slash = strchr(url_hint, '/'); |
| 1963 if ((first_slash != NULL) && (first_slash != url_hint) && |
| 1964 (first_slash[-1] == ':') && (first_slash[1] == '/') && |
| 1965 (memrchr(url_hint, '.', first_slash - url_hint) == NULL)) { |
| 1966 // We found :// and no dot in front of it, so declare a real URL |
| 1967 |
| 1968 const char* hostname_start = first_slash + 2; |
| 1969 const char* hostname_end = strchr(hostname_start, '/'); |
| 1970 if (hostname_end == NULL) { |
| 1971 // No slash; end is first byte off end of the URL string |
| 1972 hostname_end = url_hint + url_len; |
| 1973 } |
| 1974 size_t hostname_len = hostname_end - hostname_start; |
| 1975 const char* port_start = |
| 1976 (const char*)memchr(hostname_start, ':', hostname_len); |
| 1977 if (port_start != NULL) { |
| 1978 // Port; shorten hostname |
| 1979 hostname_end = port_start; |
| 1980 hostname_len = hostname_end - hostname_start; |
| 1981 } |
| 1982 |
| 1983 const char* tld_start = MyMemrchr(hostname_start, '.', hostname_len); |
| 1984 if (tld_start != NULL) { |
| 1985 // Remember the TLD we just found |
| 1986 int tld_len = hostname_start + hostname_len - tld_start - 1; |
| 1987 if (tld_len > (tld_hint_len - 1)) { |
| 1988 tld_len = tld_hint_len - 1; |
| 1989 } |
| 1990 memcpy(tld_hint, tld_start + 1, tld_len); |
| 1991 tld_hint[tld_len] = '\0'; |
| 1992 } |
| 1993 *ret_host_start = hostname_start; |
| 1994 *ret_host_len = hostname_len; |
| 1995 return; |
| 1996 } |
| 1997 } else { |
| 1998 strncpy(tld_hint, url_hint, tld_hint_len); |
| 1999 tld_hint[tld_hint_len - 1] = '\0'; |
| 2000 } |
| 2001 } |
| 2002 |
| 2003 // Apply hints, if any, to probabilities |
| 2004 // NOTE: Encoding probabilites are all zero at this point |
| 2005 void ApplyHints(const char* url_hint, |
| 2006 const char* http_charset_hint, |
| 2007 const char* meta_charset_hint, |
| 2008 const int encoding_hint, |
| 2009 const Language language_hint, |
| 2010 const CompactEncDet::TextCorpusType corpus_type, |
| 2011 DetectEncodingState* destatep) { |
| 2012 int hint_count = 0; |
| 2013 // url_hint can either be a full URL (preferred) or just top-level domain name |
| 2014 // Extract the TLD from a full URL and use it for |
| 2015 // a normal TLD hint |
| 2016 |
| 2017 char tld_hint[16]; |
| 2018 const char* hostname_start = NULL; |
| 2019 int hostname_len = 0; |
| 2020 ExtractTLD(url_hint, tld_hint, sizeof(tld_hint), |
| 2021 &hostname_start, &hostname_len); |
| 2022 |
| 2023 |
| 2024 // Initial hints give slight boost to Ascii-7-bit and code page 1252 |
| 2025 // ApplyXxx routines copy enc_1 to enc_2 then update declared_enc_1 |
| 2026 // This gives a boost to 1252 if one of HTTP/META is specified, |
| 2027 // but this could be the wrong thing to do if Latin2/3/4/etc. is specified |
| 2028 destatep->declared_enc_1 = F_CP1252; |
| 2029 destatep->declared_enc_2 = F_ASCII_7_bit; |
| 2030 |
| 2031 // Applying various hints takes max of new hint and any old hint. |
| 2032 // This does better on multiple hints that a weighted average |
| 2033 |
| 2034 // Weight is 0..100 percent |
| 2035 if ((http_charset_hint != NULL) && (http_charset_hint[0] != '~')) { |
| 2036 destatep->declared_enc_2 = destatep->declared_enc_1; |
| 2037 hint_count += ApplyCharsetHint(http_charset_hint, 100, destatep); |
| 2038 destatep->http_hint = kMapToEncoding[destatep->declared_enc_1]; |
| 2039 if ((destatep->declared_enc_1 == F_CP1252) || |
| 2040 (destatep->declared_enc_1 == F_Latin1)) { |
| 2041 destatep->looking_for_latin_trigrams = true; |
| 2042 } |
| 2043 } |
| 2044 if ((meta_charset_hint != NULL) && (meta_charset_hint[0] != '~')) { |
| 2045 destatep->declared_enc_2 = destatep->declared_enc_1; |
| 2046 hint_count += ApplyCharsetHint(meta_charset_hint, 100, destatep); |
| 2047 destatep->meta_hint = kMapToEncoding[destatep->declared_enc_1]; |
| 2048 if ((destatep->declared_enc_1 == F_CP1252) || |
| 2049 (destatep->declared_enc_1 == F_Latin1)) { |
| 2050 destatep->looking_for_latin_trigrams = true; |
| 2051 } |
| 2052 } |
| 2053 if (encoding_hint != UNKNOWN_ENCODING) { |
| 2054 destatep->declared_enc_2 = destatep->declared_enc_1; |
| 2055 hint_count += ApplyEncodingHint(encoding_hint, 50, destatep); |
| 2056 } |
| 2057 if (language_hint != UNKNOWN_LANGUAGE) { |
| 2058 destatep->declared_enc_2 = destatep->declared_enc_1; |
| 2059 hint_count += ApplyUILangaugeHint(language_hint, 50, destatep); |
| 2060 } |
| 2061 // Use top level domain if not .com and <=1 other hint was available |
| 2062 if (url_hint != NULL) { |
| 2063 destatep->tld_hint = CompactEncDet::TopEncodingOfTLDHint(tld_hint); |
| 2064 if (hint_count == 0) { |
| 2065 // Apply with weight 100% |
| 2066 destatep->declared_enc_2 = destatep->declared_enc_1; |
| 2067 hint_count += ApplyTldHint(tld_hint, 100, destatep); |
| 2068 if ((destatep->declared_enc_1 == F_CP1252) || |
| 2069 (destatep->declared_enc_1 == F_Latin1)) { |
| 2070 destatep->looking_for_latin_trigrams = true; |
| 2071 } |
| 2072 if (strcmp("hu", tld_hint) == 0) { |
| 2073 // Hungarian is particularly difficult to separate Latin2 from Latin1, |
| 2074 // so always look for trigram scanning if bare TLD=hu hint |
| 2075 destatep->looking_for_latin_trigrams = true; |
| 2076 } |
| 2077 // Treat .com as no TLD hint at all |
| 2078 } else if ((hint_count == 1) && (strcmp("com", tld_hint) != 0)) { |
| 2079 // Either shift weighting or consider doing no TLD here -- seems to |
| 2080 // distract from correct charset= hints. Or perhaps apply only if |
| 2081 // charset = Latin1/1252... |
| 2082 // Apply with weight 50% |
| 2083 destatep->declared_enc_2 = destatep->declared_enc_1; |
| 2084 hint_count += ApplyTldHint(tld_hint, 50, destatep); |
| 2085 if ((destatep->declared_enc_1 == F_CP1252) || |
| 2086 (destatep->declared_enc_1 == F_Latin1)) { |
| 2087 destatep->looking_for_latin_trigrams = true; // These need trigrams |
| 2088 } |
| 2089 } |
| 2090 // Else ignore TLD hint entirely |
| 2091 } |
| 2092 |
| 2093 // Use all-web default distribution if not even a TLD hint |
| 2094 if (hint_count == 0) { |
| 2095 destatep->looking_for_latin_trigrams = true; // Default needs trigrams |
| 2096 destatep->declared_enc_2 = destatep->declared_enc_1; |
| 2097 hint_count += ApplyDefaultHint(corpus_type, 100, destatep); |
| 2098 } |
| 2099 |
| 2100 |
| 2101 // ISO-Microsoft Pairs |
| 2102 // F_Latin1, F_CP1252, |
| 2103 // F_Latin2, F_CP1250, NOT really strict subset/superset pairs |
| 2104 // F_Latin3, |
| 2105 // F_Latin4, |
| 2106 // F_ISO_8859_5, F_CP1251, |
| 2107 // F_Arabic, F_CP1256, NOT |
| 2108 // F_Greek, F_CP1253, NOT really pairs |
| 2109 // (or upgrade incvt to make Greek use CP) |
| 2110 // F_Hebrew, F_CP1255, NOT really pairs |
| 2111 // F_Latin5, F_CP1254, |
| 2112 // F_Latin6, |
| 2113 // F_ISO_8859_11, |
| 2114 // F_ISO_8859_13, F_CP1257, |
| 2115 // F_ISO_8859_15, |
| 2116 // ISO-Microsoft Pairs |
| 2117 |
| 2118 // Get important families started together |
| 2119 // // This should fall out of the initializatoin vectors for charset, |
| 2120 // but we need to get rid of families alltogetrher |
| 2121 // |
| 2122 // TODO make this more graceful |
| 2123 |
| 2124 // Add small bias for subsets |
| 2125 |
| 2126 // Subtract small bias for supersets |
| 2127 destatep->enc_prob[F_CP932] = destatep->enc_prob[F_SJS] - kSmallInitDiff; |
| 2128 |
| 2129 destatep->enc_prob[F_GBK] = destatep->enc_prob[F_GB] - kSmallInitDiff; |
| 2130 destatep->enc_prob[F_GB18030] = destatep->enc_prob[F_GB] - kSmallInitDiff; |
| 2131 |
| 2132 destatep->enc_prob[F_BIG5_CP950] = destatep->enc_prob[F_BIG5] - |
| 2133 kSmallInitDiff; |
| 2134 destatep->enc_prob[F_BIG5_HKSCS] = destatep->enc_prob[F_BIG5] - |
| 2135 kSmallInitDiff; |
| 2136 |
| 2137 // Deliberate over-bias Ascii7 and underbias Binary [unneeded] |
| 2138 // destatep->enc_prob[F_ASCII_7_bit] = destatep->enc_prob[F_ASCII_7_bit] + kSm
allInitDiff; |
| 2139 // destatep->enc_prob[F_BINARY] = destatep->enc_prob[F_BINARY] - (kBoostInitia
l / 2); |
| 2140 |
| 2141 if (destatep->debug_data != NULL) { |
| 2142 // Show state at end of hints |
| 2143 SetDetailsEncProb(destatep, 0, -1, "Endhints"); |
| 2144 if(FLAGS_enc_detect_detail2) { |
| 2145 // Add a line showing the watched encoding(s) |
| 2146 if (watch1_rankedenc >= 0) { |
| 2147 SetDetailsEncProb(destatep, 0, |
| 2148 watch1_rankedenc, FLAGS_enc_detect_watch1.c_str()); |
| 2149 } |
| 2150 if (watch2_rankedenc >= 0) { |
| 2151 SetDetailsEncProb(destatep, 0, |
| 2152 watch2_rankedenc, FLAGS_enc_detect_watch2.c_str()); |
| 2153 } |
| 2154 } // End detail2 |
| 2155 } |
| 2156 |
| 2157 // If duplicate hints, set second one to ASCII_7BIT to prevent double-boost |
| 2158 if (destatep->declared_enc_1 == destatep->declared_enc_2) { |
| 2159 destatep->declared_enc_2 = F_ASCII_7_bit; |
| 2160 } |
| 2161 |
| 2162 if (FLAGS_force127) { |
| 2163 destatep->do_latin_trigrams = true; |
| 2164 if (FLAGS_enc_detect_source) { |
| 2165 PsHighlight(0, destatep->initial_src, 0, 2); |
| 2166 } |
| 2167 } |
| 2168 |
| 2169 |
| 2170 if (FLAGS_counts && destatep->looking_for_latin_trigrams) {++looking_used;} |
| 2171 if (FLAGS_counts && destatep->do_latin_trigrams) {++doing_used;} |
| 2172 |
| 2173 // |
| 2174 // At this point, destatep->enc_prob[] is an initial probability vector based |
| 2175 // on the given hints/default. In general, it spreads out least-likely |
| 2176 // encodings to be about 2**-25 below the most-likely encoding. |
| 2177 // For input text with lots of bigrams, an unlikely encoding can rise to |
| 2178 // the top at a rate of about 2**6 per bigram, and more commonly 2**2 per |
| 2179 // bigram. So more than 4 bigrams and commonly more than 12 are |
| 2180 // needed to overcome the initial hints when the least-likely encoding |
| 2181 // is in fact the correct answer. So if the entire text has very few bigrams |
| 2182 // (as a two-word query might), it can be impossible for the correct |
| 2183 // encoding to win. |
| 2184 // |
| 2185 // To compensate for this, we take the initial hint vector and effectively |
| 2186 // apply it at the rate of 1/16 every bigram for the first 16 bigrams. The |
| 2187 // actual mechanism is done just before the last prune. |
| 2188 // |
| 2189 |
| 2190 // Remember Initial hint probabilities |
| 2191 memcpy(destatep->hint_prob, destatep->enc_prob, sizeof(destatep->enc_prob)); |
| 2192 } |
| 2193 |
| 2194 // Look for specific high-value patterns in the first 4 bytes |
| 2195 // Byte order marks (BOM) |
| 2196 // EFBBBF UTF-8 |
| 2197 // FEFF UTF-16 BE |
| 2198 // FFFE UTF-16 LE |
| 2199 // FFFE0000 UTF-32 LE |
| 2200 // 0000FEFF UTF-32 BE |
| 2201 // |
| 2202 // Likely UTF-x of seven-bit ASCII |
| 2203 // 00xx UTF-16 BE xx printable ASCII |
| 2204 // xx00 UTF-16 LE |
| 2205 // 000000xx UTF-32 BE |
| 2206 // xx000000 UTF-32 LE |
| 2207 // |
| 2208 void InitialBytesBoost(const uint8* src, |
| 2209 int text_length, |
| 2210 DetectEncodingState* destatep) { |
| 2211 if (text_length < 4) {return;} |
| 2212 |
| 2213 char32 pair01 = (src[0] << 8) | src[1]; |
| 2214 char32 pair23 = (src[2] << 8) | src[3]; |
| 2215 char32 quad0123 = (pair01 << 16) | pair23; |
| 2216 |
| 2217 bool utf_16_indication = false; |
| 2218 bool utf_32_indication = false; |
| 2219 int best_enc = -1; |
| 2220 |
| 2221 // Byte order marks |
| 2222 // UTF-8 |
| 2223 if ((quad0123 & 0xffffff00) == 0xEFBBBF00) { |
| 2224 destatep->bom_hint = UTF8; |
| 2225 Boost(destatep, F_UTF8, kBoostInitial * 2); |
| 2226 Boost(destatep, F_UTF8UTF8, kBoostInitial * 2); |
| 2227 best_enc = F_UTF8; |
| 2228 // UTF-32 (test before UTF-16) |
| 2229 } else if (quad0123 == 0x0000FEFF) { |
| 2230 destatep->bom_hint = UTF32BE; |
| 2231 Boost(destatep, F_UTF_32BE, kBoostInitial * 2); |
| 2232 best_enc = F_UTF_32BE; |
| 2233 } else if (quad0123 == 0xFFFE0000) { |
| 2234 destatep->bom_hint = UTF32LE; |
| 2235 Boost(destatep, F_UTF_32LE, kBoostInitial * 2); |
| 2236 best_enc = F_UTF_32LE; |
| 2237 // UTF-16 |
| 2238 } else if (pair01 == 0xFEFF) { |
| 2239 destatep->bom_hint = UTF16BE; |
| 2240 Boost(destatep, F_UTF_16BE, kBoostInitial * 3); |
| 2241 best_enc = F_UTF_16BE; |
| 2242 } else if (pair01 == 0xFFFE) { |
| 2243 destatep->bom_hint = UTF16LE; |
| 2244 Boost(destatep, F_UTF_16LE, kBoostInitial * 3); |
| 2245 best_enc = F_UTF_16LE; |
| 2246 |
| 2247 // Possible seven-bit ASCII encoded as UTF-16/32 |
| 2248 // UTF-32 (test before UTF-16) |
| 2249 } else if (((quad0123 & 0xffffff00) == 0) && |
| 2250 (kIsPrintableAscii[src[3]] != 0)) { |
| 2251 Boost(destatep, F_UTF_32BE, kBoostInitial); |
| 2252 Whack(destatep, F_UTF_32LE, kBadPairWhack); // Illegal char |
| 2253 best_enc = F_UTF_32BE; |
| 2254 } else if (((quad0123 & 0x00ffffff) == 0) && |
| 2255 (kIsPrintableAscii[src[0]] != 0)) { |
| 2256 Boost(destatep, F_UTF_32LE, kBoostInitial); |
| 2257 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal char |
| 2258 best_enc = F_UTF_32LE; |
| 2259 } else if ((src[0] == 0x00) && (kIsPrintableAscii[src[1]] != 0)) { |
| 2260 Boost(destatep, F_UTF_16BE, kBoostInitial); |
| 2261 best_enc = F_UTF_16BE; |
| 2262 } else if ((src[1] == 0x00) && (kIsPrintableAscii[src[0]] != 0)) { |
| 2263 Boost(destatep, F_UTF_16LE, kBoostInitial); |
| 2264 best_enc = F_UTF_16LE; |
| 2265 |
| 2266 // Whack if 0000 or FFFF |
| 2267 // UTF-32 (test before UTF-16) |
| 2268 } else if (quad0123 == 0x00000000) { |
| 2269 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal char |
| 2270 Whack(destatep, F_UTF_32LE, kBadPairWhack); |
| 2271 Whack(destatep, F_UTF_16BE, kBadPairWhack); |
| 2272 Whack(destatep, F_UTF_16LE, kBadPairWhack); |
| 2273 best_enc = -1; |
| 2274 } else if (quad0123 == 0xffffffff) { |
| 2275 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal char |
| 2276 Whack(destatep, F_UTF_32LE, kBadPairWhack); |
| 2277 Whack(destatep, F_UTF_16BE, kBadPairWhack); |
| 2278 Whack(destatep, F_UTF_16LE, kBadPairWhack); |
| 2279 best_enc = -1; |
| 2280 } else if (pair01 == 0x0000) { |
| 2281 Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal char |
| 2282 Whack(destatep, F_UTF_16LE, kBadPairWhack); |
| 2283 best_enc = -1; |
| 2284 } else if (pair01 == 0xffff) { |
| 2285 Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal char |
| 2286 Whack(destatep, F_UTF_16LE, kBadPairWhack); |
| 2287 best_enc = -1; |
| 2288 |
| 2289 |
| 2290 // These are the first four bytes of some known binary file formats |
| 2291 |
| 2292 // Boost BINARY bigtime if JPEG FFD8FFxx |
| 2293 // Boost BINARY bigtime if png 89504E47 (.PNG) |
| 2294 // Boost BINARY bigtime if gif 47494638 (GIF8) |
| 2295 // Boost BINARY bigtime if zip 504B0304 (PK..) |
| 2296 // Boost BINARY bigtime if gzip 1F8B08xx |
| 2297 // Boost BINARY bigtime if zlib 78DAxxxx |
| 2298 // Boost BINARY if PDF 25504446 (%PDF) |
| 2299 // Boost BINARY if SWF (FWSx or CWSx where x <= 0x1f) |
| 2300 } else if ((quad0123 & 0xffffff00) == 0xFFD8FF00) { // JPEG FFD8FFxx |
| 2301 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 2302 } else if (quad0123 == 0x89504E47) { // Hex 89 P N G |
| 2303 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 2304 } else if (quad0123 == 0x47494638) { // Hex GIF8 |
| 2305 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 2306 } else if (quad0123 == 0x504B0304) { // Hex P K 03 04 |
| 2307 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 2308 } else if ((quad0123 & 0xffffff00) == 0x1F8B0800) { // gzip 1F8B08xx |
| 2309 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 2310 } else if (pair01 == 0x78DA) { // gzip 78DAxxxx |
| 2311 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 2312 } else if (quad0123 == 0x25504446) { // Hex %PDF |
| 2313 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 2314 } else if ((quad0123 & 0xffffff1f) == 0x66535700) { // Hex FWSx |
| 2315 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 2316 } else if ((quad0123 & 0xffffff1f) == 0x63535700) { // Hex CWSx |
| 2317 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 2318 |
| 2319 // More binary detect prefixes |
| 2320 // 7F E L F Executable and linking format |
| 2321 // M M 00 * TIFF (big-endian, "MM" byte order) |
| 2322 // * 00 M M byte-reversed form of the above (standard little-endian TIFF is I I * 00) |
| 2323 // 01 f c p Final cut pro |
| 2324 } else if (quad0123 == 0x7F454C46) { // Hex 7F E L F |
| 2325 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 2326 } else if (quad0123 == 0x4D4D002A) { // Hex M M 00 * |
| 2327 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 2328 } else if (quad0123 == 0x2A004D4D) { // Hex * 00 M M |
| 2329 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 2330 } else if (quad0123 == 0x01666370) { // Hex 01 f c p |
| 2331 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 2332 |
| 2333 // More binary detect prefixes; all-ASCII names; heavy weight to avoid ASCII |
| 2334 // prefix overcoming binary |
| 2335 // C C S D USGS ISIS 3-D cube files |
| 2336 // S I M P FITS image header "SIMPLE " |
| 2337 } else if (quad0123 == 0x43435344) { // Hex C C S D |
| 2338 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 2339 } else if (quad0123 == 0x53494D50) { // Hex S I M P |
| 2340 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 2341 |
| 2342 // More binary detect prefixes; all-ASCII names; lighter weight |
| 2343 // H W P Hangul word processor |
| 2344 // 8 B P S Photoshop |
| 2345 // P D S _ xx "PDS_VERSION_ID " |
| 2346 } else if (quad0123 == 0x48575020) { // Hex H W P |
| 2347 if ((19 <= text_length) && |
| 2348 (memcmp(src, "HWP.Document.File.V", 19) == 0)) { |
| 2349 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 2350 } else if ((19 <= text_length) && |
| 2351 (memcmp(src, "HWP Document File V", 19) == 0)) { |
| 2352 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 2353 } else { |
| 2354 Boost(destatep, F_BINARY, kBoostInitial * kWeakerBinary); |
| 2355 } |
| 2356 } else if (quad0123 == 0x38425053) { // Hex 8 B P S |
| 2357 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 2358 } else if (quad0123 == 0x5044535F) { // Hex P D S _ |
| 2359 if ((14 <= text_length) && (memcmp(src, "PDS_VERSION_ID", 14) == 0)) { |
| 2360 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| 2361 } else { |
| 2362 Boost(destatep, F_BINARY, kBoostInitial * kWeakerBinary); |
| 2363 } |
| 2364 } |
| 2365 |
| 2366 // There are several main Windows EXE file formats. |
| 2367 // Not examined here (prefix too short; never see them in Google pipeline) |
| 2368 // M Z DOS .exe Mark Zbikowski |
| 2369 // N E DOS 4.0 16-bit |
| 2370 // L E OS/2 VxD drivers |
| 2371 // L X OS/2 |
| 2372 // P E Windows NT |
| 2373 |
| 2374 |
| 2375 // More user-defined |
| 2376 // http://www.freenet.am/armscii/ Armenian |
| 2377 |
| 2378 // If any hints or BOM, etc. keep UTF 16/32 around |
| 2379 if ((destatep->enc_prob[F_UTF_16BE] > 0) || |
| 2380 (destatep->enc_prob[F_UTF_16LE] > 0)) { |
| 2381 utf_16_indication = true; |
| 2382 } |
| 2383 if ((destatep->enc_prob[F_UTF_32BE] > 0) || |
| 2384 (destatep->enc_prob[F_UTF_32LE] > 0)) { |
| 2385 utf_32_indication = true; |
| 2386 } |
| 2387 |
| 2388 |
| 2389 // Kill UTF16/32 right now if no positive indication of them |
| 2390 // Otherwise, they tend to rise to the top in 7-bit files with an |
| 2391 // occasional 0x02 byte in some comment or javascript |
| 2392 if (!utf_16_indication) { |
| 2393 Whack(destatep, F_UTF_16BE, kBadPairWhack * 8); |
| 2394 Whack(destatep, F_UTF_16LE, kBadPairWhack * 8); |
| 2395 Whack(destatep, F_Unicode, kBadPairWhack * 8); |
| 2396 } |
| 2397 if (!utf_32_indication) { |
| 2398 Whack(destatep, F_UTF_32BE, kBadPairWhack * 8); |
| 2399 Whack(destatep, F_UTF_32LE, kBadPairWhack * 8); |
| 2400 } |
| 2401 |
| 2402 // Usually kill mixed encodings |
| 2403 if (!FLAGS_ced_allow_utf8utf8) { |
| 2404 Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8); |
| 2405 } |
| 2406 // 2011.11.07 never use UTF8CP1252 -- answer will be UTF8 instead |
| 2407 Whack(destatep, F_UTF8CP1252, kBadPairWhack * 8); |
| 2408 |
| 2409 if (destatep->debug_data != NULL) { |
| 2410 // Show first four bytes of the input |
| 2411 char buff[16]; |
| 2412 snprintf(buff, sizeof(buff), "%04x%04x", pair01, pair23); |
| 2413 SetDetailsEncProb(destatep, 0, best_enc, buff); |
| 2414 } |
| 2415 } |
| 2416 |
| 2417 |
| 2418 |
// qsort()-style comparator over int values producing DESCENDING order:
// returns 1 when *v1 < *v2, -1 when *v1 > *v2, and 0 when they are equal.
int IntCompare(const void* v1, const void* v2) {
  const int lhs = *static_cast<const int*>(v1);
  const int rhs = *static_cast<const int*>(v2);
  if (lhs == rhs) {
    return 0;
  }
  return (lhs < rhs) ? 1 : -1;
}
| 2427 |
| 2428 bool Base64Char(uint8 c) { |
| 2429 if (('A' <= c) && (c <= 'Z')) {return true;} |
| 2430 if (('a' <= c) && (c <= 'z')) {return true;} |
| 2431 if (('0' <= c) && (c <= '9')) {return true;} |
| 2432 if ('+' == c) {return true;} |
| 2433 if ('/' == c) {return true;} |
| 2434 return false; |
| 2435 } |
| 2436 |
| 2437 int Base64ScanLen(const uint8* start, const uint8* limit) { |
| 2438 // We have a plausible beginning; scan entire base64 string |
| 2439 const uint8* ib64str = start; |
| 2440 const uint8* b64str = ib64str; |
| 2441 const uint8* b64strlimit = limit; |
| 2442 // if starts with + +++, assume it is drawing, so bogus |
| 2443 if (((limit - start) > 3) && (start[0] == '+') && |
| 2444 (start[1] == '+') && (start[2] == '+')) { |
| 2445 return 81; |
| 2446 } |
| 2447 // Scan over base64 |
| 2448 while ((b64str < b64strlimit) && (kBase64Value[*b64str++] >= 0)) { |
| 2449 } |
| 2450 b64str--; // We overshot by 1 |
| 2451 return b64str - ib64str; |
| 2452 } |
| 2453 |
| 2454 // Input is at least 8-character legal base64 string after +. |
| 2455 // But might be say + "Presse+Termine" |
| 2456 bool GoodUnicodeFromBase64(const uint8* start, const uint8* limit) { |
| 2457 // Reject base64 string len N if density of '+' is > 1 + N/16 (expect 1/64) |
| 2458 // Reject base64 string len N if density of A-Z is < 1 + N/16 (expect 26/64) |
| 2459 // Reject base64 string len N if density of a-z is < 1 + N/16 (expect 26/64) |
| 2460 // Reject base64 string len N if density of 0-9 is < 1 + N/32 (expect 10/64) |
| 2461 // NOTE: this requires at least one lower AND one upper AND one digit to pass |
| 2462 // |
| 2463 int plus_count = 0; |
| 2464 int lower_count = 0; |
| 2465 int upper_count = 0; |
| 2466 int digit_count = 0; |
| 2467 int len = limit - start; |
| 2468 for (const uint8* src = start; src < limit; ++src) { |
| 2469 uint8 c = *src; |
| 2470 if (('a' <= c) && (c <= 'z')) { |
| 2471 ++lower_count; |
| 2472 } else if (('A' <= c) && (c <= 'Z')) { |
| 2473 ++upper_count; |
| 2474 } else if (('0' <= c) && (c <= '0')) { |
| 2475 ++digit_count; |
| 2476 } else if (*src == '+') { |
| 2477 ++plus_count; |
| 2478 } |
| 2479 } |
| 2480 |
| 2481 if (plus_count > (1 + (len >> 4))) {return false;} |
| 2482 if (lower_count < (1 + (len >> 4))) {return false;} |
| 2483 if (upper_count < (1 + (len >> 4))) {return false;} |
| 2484 if (digit_count < (1 + (len >> 5))) {return false;} |
| 2485 |
| 2486 // checking the last character to reduce false positive |
| 2487 // since the last character may be padded to 0 bits at the end. |
| 2488 // refer to http://en.wikipedia.org/wiki/UTF-7 |
| 2489 int nmod8 = len & 7; |
| 2490 const uint8 last = *(start+len-1); |
| 2491 // When UTF-7 string length%8=3, the last two bits must be padded as 0 |
| 2492 if ((nmod8 == 3) && (kBase64Value[last] & 3)) {return false;} |
| 2493 // When UTF-7 string length%8=6, the last four bits must be padded as 0 |
| 2494 if ((nmod8 == 6) && (kBase64Value[last] & 15)) {return false;} |
| 2495 return true; |
| 2496 } |
| 2497 |
| 2498 // Prune here after N bytes |
| 2499 // Boost here for seven-bit sequences (at every prune) |
| 2500 // if (sevenbitrankedencoding) |
| 2501 // + UTF7 scan and boost/demote len mod 8 = 0 3 6 |
| 2502 // ~ Hz scan and boost/demote len mod 8 = 0 2 4 6 |
| 2503 // 1B 2022 scan and boost/demote len mod 8 = 0 2 4 6 |
| 2504 // 0E 2022 scan and boost/demote len mod 8 = 0 2 4 6 |
| 2505 // [0F 2022 boost/demote] |
| 2506 // 00 UTF16/32 scan and boost/demote offset = even/odd |
| 2507 // |
| 2508 // If still some seven-bit possibilities > pure ASCII, |
| 2509 // scan each possibility for clearer prob, s.t. about |
| 2510 // two good sequences is a clear win |
| 2511 // A-Z 00-19 00xx-64xx (B = 04xx) |
| 2512 // a-z 1A-33 68xx-CCxx (f = 7Cxx) |
| 2513 // 0-9 34-3D D0xx-F4xx (1 = D4xx) |
| 2514 // + 3E F8xx |
| 2515 // / 3F FCxx |
| 2516 // do another chunk with slow scan |
| 2517 |
| 2518 |
| 2519 // Boost, whack, or leave alone UTF-7 probablilty |
| 2520 void UTF7BoostWhack(DetectEncodingState* destatep, int next_pair, uint8 byte2) { |
| 2521 int off = destatep->interesting_offsets[AsciiPair][next_pair]; |
| 2522 if (off >= destatep->prior_utf7_offset) { |
| 2523 // Not part of a previous successful UTF-7 string |
| 2524 ++destatep->utf7_starts; |
| 2525 |
| 2526 if (byte2 == '-') { |
| 2527 // +- encoding for '+' neutral |
| 2528 } else if (!Base64Char(byte2)) { |
| 2529 // Not base64 -- not UTF-7, whack |
| 2530 Whack(destatep, F_UTF7, kBadPairWhack); // Illegal pair |
| 2531 } else { |
| 2532 // Starts with base64 byte, might be a good UTF7 sequence |
| 2533 const uint8* start = destatep->initial_src + off + 1; // over the + |
| 2534 int n = Base64ScanLen(start, destatep->limit_src); |
| 2535 int nmod8 = n & 7; |
| 2536 if ((n == 3) || (n == 6)) { |
| 2537 // short but legal -- treat as neutral |
| 2538 } else if ((nmod8 == 0) | (nmod8 == 3) | (nmod8 == 6)) { |
| 2539 // Good length. Check for good Unicode. |
| 2540 if (GoodUnicodeFromBase64(start, start + n)) { |
| 2541 // Good length and Unicode, boost |
| 2542 Boost(destatep, F_UTF7, kBoostOnePair); // Found good |
| 2543 destatep->prior_utf7_offset = off + n + 1; |
| 2544 } else { |
| 2545 // Bad Unicode. Whack |
| 2546 Whack(destatep, F_UTF7, kBadPairWhack); // Illegal length |
| 2547 } |
| 2548 } else { |
| 2549 // Bad length. Whack |
| 2550 Whack(destatep, F_UTF7, kBadPairWhack); // Illegal length |
| 2551 } |
| 2552 } |
| 2553 } |
| 2554 } |
| 2555 |
| 2556 // Boost, whack, or leave alone HZ probablilty |
| 2557 void HzBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) { |
| 2558 if ((byte2 == '{') || (byte2 == '}')) { |
| 2559 Boost(destatep, F_HZ_GB_2312, kBoostOnePair); // Found ~{ or ~} |
| 2560 } else if ((byte2 == '~') || (byte2 == '\n')) { |
| 2561 destatep->enc_prob[F_HZ_GB_2312] += 0; // neutral |
| 2562 } else { |
| 2563 Whack(destatep, F_HZ_GB_2312, kBadPairWhack); // Illegal pair |
| 2564 } |
| 2565 } |
| 2566 |
| 2567 // Boost, whack, or leave alone BINARY probablilty |
| 2568 void BinaryBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) { |
| 2569 int quadrant = ((byte1 & 0x80) >> 6) | ((byte2 & 0x80) >> 7); |
| 2570 int bucket8x4 = ((byte1 & 0xe0) >> 3) | ((byte2 & 0xc0) >> 6); |
| 2571 uint32 quad_mask = 1 << quadrant; |
| 2572 uint32 bucket8x4_mask = 1 << bucket8x4; |
| 2573 if ((destatep->binary_quadrants_seen & quad_mask) == 0) { |
| 2574 destatep->binary_quadrants_seen |= quad_mask; |
| 2575 destatep->binary_quadrants_count += 1; |
| 2576 if (destatep->binary_quadrants_count == 4) { |
| 2577 Boost(destatep, F_BINARY, kBoostOnePair * 2); // Found all 4 quadrants, |
| 2578 // boost 2 pairs |
| 2579 } |
| 2580 } |
| 2581 if ((destatep->binary_8x4_seen & bucket8x4_mask) == 0) { |
| 2582 destatep->binary_8x4_seen |= bucket8x4_mask; |
| 2583 destatep->binary_8x4_count += 1; |
| 2584 if (destatep->binary_8x4_count >= 11) { |
| 2585 Boost(destatep, F_BINARY, kBoostOnePair * 4); // Found 11+/20 buckets, |
| 2586 // boost 4 pairs each time |
| 2587 } |
| 2588 } |
| 2589 } |
| 2590 |
| 2591 |
| 2592 // Demote UTF-16/32 on 0000 or FFFF, favoring Binary |
| 2593 void UTF1632BoostWhack(DetectEncodingState* destatep, int offset, uint8 byte1) { |
| 2594 if (byte1 == 0) { // We have 0000 |
| 2595 Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal pair |
| 2596 Whack(destatep, F_UTF_16LE, kBadPairWhack); // Illegal pair |
| 2597 switch (offset & 3) { |
| 2598 case 0: // We get called with 0 4 8, etc. for ASCII/BMP as UTF-32BE |
| 2599 Whack(destatep, F_UTF_32LE, kBadPairWhack); // Illegal pair |
| 2600 Boost(destatep, F_UTF_32BE, kSmallInitDiff); // Good pair |
| 2601 break; |
| 2602 case 1: // We get called with 1 5 9, etc. for ASCII as UTF-32LE |
| 2603 case 2: // We get called with 2 6 10, etc. for BMP as UTF-32LE |
| 2604 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal pair |
| 2605 Boost(destatep, F_UTF_32LE, kSmallInitDiff); // Good pair |
| 2606 break; |
| 2607 case 3: // ambiguous |
| 2608 break; |
| 2609 } |
| 2610 } else { // We have ffff |
| 2611 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal pair |
| 2612 Whack(destatep, F_UTF_32LE, kBadPairWhack); // Illegal pair |
| 2613 Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal pair |
| 2614 Whack(destatep, F_UTF_16LE, kBadPairWhack); // Illegal pair |
| 2615 } |
| 2616 } |
| 2617 |
| 2618 // Make even offset |
| 2619 void UTF16MakeEven(DetectEncodingState* destatep, int next_pair) { |
| 2620 destatep->interesting_offsets[OtherPair][next_pair] &= ~1; |
| 2621 } |
| 2622 |
| 2623 bool ConsecutivePair(DetectEncodingState* destatep, int i) { |
| 2624 if (i <= 0) { |
| 2625 return false; |
| 2626 } |
| 2627 return destatep->interesting_offsets[OtherPair][i] == |
| 2628 (destatep->interesting_offsets[OtherPair][i - 1] + 2); |
| 2629 } |
| 2630 |
| 2631 // boost, whack, or leave alone UTF-8 probablilty |
| 2632 // Any whacks are also applied to UTF8UTF8; CheckUTF8UTF8Seq assumes good UTF8 |
| 2633 // Returns total boost |
| 2634 int CheckUTF8Seq(DetectEncodingState* destatep, int weightshift) { |
| 2635 int startcount = destatep->prior_interesting_pair[OtherPair]; |
| 2636 int endcount = destatep->next_interesting_pair[OtherPair]; |
| 2637 |
| 2638 int demotion_count = 0; |
| 2639 for (int i = startcount; i < endcount; ++i) { |
| 2640 int sub; |
| 2641 char* s = &destatep->interesting_pairs[OtherPair][i * 2]; |
| 2642 // Demote four byte patterns that are more likely Latin1 than UTF-8 |
| 2643 // C9AE, DF92, DF93, DFAB. See note at top. |
| 2644 // Demotion also boosts Latin1 and CP1252 |
| 2645 uint8 s0 = static_cast<uint8>(s[0]); |
| 2646 uint8 s1 = static_cast<uint8>(s[1]); |
| 2647 if ((s0 == 0xc9) && (s1 == 0xae)) {++demotion_count;} |
| 2648 if ((s0 == 0xdf) && (s1 == 0x92)) {++demotion_count;} |
| 2649 if ((s0 == 0xdf) && (s1 == 0x93)) {++demotion_count;} |
| 2650 if ((s0 == 0xdf) && (s1 == 0xab)) {++demotion_count;} |
| 2651 |
| 2652 if (!ConsecutivePair(destatep, i)) { |
| 2653 // Insert a blank into the sequence; avoid wrong splices |
| 2654 sub = (' ' >> 4) & 0x0f; |
| 2655 ++destatep->utf8_minicount[ |
| 2656 static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_m
inistate)][sub])]; |
| 2657 destatep->next_utf8_ministate = |
| 2658 kMiniUTF8State[destatep->next_utf8_ministate][sub]; |
| 2659 } |
| 2660 // Byte 0 |
| 2661 sub = (s0 >> 4) & 0x0f; |
| 2662 ++destatep->utf8_minicount[ |
| 2663 static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_min
istate)][sub])]; |
| 2664 destatep->next_utf8_ministate = |
| 2665 kMiniUTF8State[destatep->next_utf8_ministate][sub]; |
| 2666 // Byte 1 |
| 2667 sub = (s1 >> 4) & 0x0f; |
| 2668 ++destatep->utf8_minicount[ |
| 2669 static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_min
istate)][sub])]; |
| 2670 destatep->next_utf8_ministate = |
| 2671 kMiniUTF8State[destatep->next_utf8_ministate][sub]; |
| 2672 DCHECK((0 <= destatep->next_utf8_ministate) && |
| 2673 (destatep->next_utf8_ministate < 8)); |
| 2674 } |
| 2675 |
| 2676 |
| 2677 // For the four specific byte combinations above, Latin1/CP1252 is more likely |
| 2678 if (demotion_count > 0) { |
| 2679 Boost(destatep, F_Latin1, kGentleOnePair * demotion_count); |
| 2680 Boost(destatep, F_CP1252, kGentleOnePair * demotion_count); |
| 2681 } |
| 2682 |
| 2683 // Boost UTF8 for completed good sequences |
| 2684 int total_boost = 2 * destatep->utf8_minicount[2] + |
| 2685 3 * destatep->utf8_minicount[3] + |
| 2686 4 * destatep->utf8_minicount[4]; |
| 2687 // But not so much for demoted bytes |
| 2688 total_boost -= (3 * demotion_count); |
| 2689 |
| 2690 total_boost *= kGentleOnePair; |
| 2691 total_boost >>= weightshift; |
| 2692 // Design: boost both UTF8 and UTF8UTF8 for each good sequence |
| 2693 Boost(destatep, F_UTF8, total_boost); |
| 2694 Boost(destatep, F_UTF8UTF8, total_boost); |
| 2695 |
| 2696 destatep->utf8_minicount[5] += destatep->utf8_minicount[2]; // total chars |
| 2697 destatep->utf8_minicount[5] += destatep->utf8_minicount[3]; // total chars |
| 2698 destatep->utf8_minicount[5] += destatep->utf8_minicount[4]; // total chars |
| 2699 destatep->utf8_minicount[2] = 0; |
| 2700 destatep->utf8_minicount[3] = 0; |
| 2701 destatep->utf8_minicount[4] = 0; |
| 2702 |
| 2703 // Whack (2 bytes) for errors |
| 2704 int error_whack = 2 * destatep->utf8_minicount[1]; |
| 2705 error_whack *= kGentlePairWhack; |
| 2706 error_whack >>= weightshift; |
| 2707 Whack(destatep, F_UTF8, error_whack); |
| 2708 Whack(destatep, F_UTF8UTF8, error_whack); |
| 2709 destatep->utf8_minicount[1] = 0; |
| 2710 |
| 2711 return total_boost - error_whack; |
| 2712 } |
| 2713 |
| 2714 |
| 2715 // Boost, whack, or leave alone UTF8UTF8 probability |
| 2716 // |
| 2717 // We are looking for |
| 2718 // (1) chars ONLY in set UTF8(0080)..UTF8(00FF), including for 80..9F the |
| 2719 // MS CP1252 mappings, and |
| 2720 // (2) sequences of 2 or more such characters |
| 2721 // |
| 2722 // If so, we could be looking at some non-7-bit encoding extra-converted |
| 2723 // to UTF-8. The most common observed is CP1252->UTF8 twice, |
| 2724 // 1252=>UTF8 : 1252=>UTF8 |
| 2725 // where the colon means "take those bytes and pretend that they are 1252". |
| 2726 // We have a couple of examples of BIG5 bytes converted as though |
| 2727 // they were 1252, |
| 2728 // BIG5 : 1252=>UTF8 |
| 2729 // |
| 2730 // Of course, we don't want correctly converted 1252 to be flagged here |
| 2731 // 1252=>UTF8 |
| 2732 // So we want the input high bytes to be in pairs or longer, hence the |
| 2733 // output UTF8 in groups of four bytes or more |
| 2734 // |
| 2735 // Good chars: C2xx, C3xx, |
| 2736 // Good chars: C592, C593, C5A0, C5A1, C5B8, C5BD, C5BE, C692, CB86, CB9C |
| 2737 // Good chars: E280xx E282AC E284A2 |
| 2738 // C2xx 1100001x 10xxxxxx (128/128) |
| 2739 // C5xx 11000101 10xx00xx (16/4) |
| 2740 // C5xx 11000101 10111xxx (8/3) |
| 2741 // C692 11000110 10010010 (1/1) |
| 2742 // CBxx 11001011 100xx1x0 (8/2) |
| 2743 // E28x 11100010 10000xx0 (4/3) |
| 2744 // |
| 2745 // Returns total boost |
| 2746 int CheckUTF8UTF8Seq(DetectEncodingState* destatep, int weightshift) { |
| 2747 int this_pair = destatep->prior_interesting_pair[OtherPair]; |
| 2748 int startbyteoffset = this_pair * 2; |
| 2749 int endbyteoffset = destatep->next_interesting_pair[OtherPair] * 2; |
| 2750 char* startbyte = &destatep->interesting_pairs[OtherPair][startbyteoffset]; |
| 2751 char* endbyte = &destatep->interesting_pairs[OtherPair][endbyteoffset]; |
| 2752 |
| 2753 int pair_number = this_pair; |
| 2754 for (char* s = startbyte; s < endbyte; s += 2) { |
| 2755 int next = destatep->next_utf8utf8_ministate; |
| 2756 if (!ConsecutivePair(destatep, pair_number)) { |
| 2757 // Insert two blanks into the sequence to avoid wrong splices |
| 2758 // go back to no odd-byte offset |
| 2759 destatep->utf8utf8_odd_byte = 0; |
| 2760 int sub = UTF88Sub(' ', ' '); |
| 2761 ++destatep->utf8utf8_minicount[static_cast<int>(kMiniUTF8UTF8Count[next][s
ub])]; |
| 2762 next = kMiniUTF8UTF8State[next][sub]; |
| 2763 } |
| 2764 |
| 2765 int odd = destatep->utf8utf8_odd_byte; |
| 2766 if (s + 1 + odd >= endbyte) continue; |
| 2767 int sub = UTF88Sub(s[0 + odd], s[1 + odd]); |
| 2768 destatep->utf8utf8_odd_byte ^= kMiniUTF8UTF8Odd[next][sub]; |
| 2769 ++destatep->utf8utf8_minicount[ |
| 2770 static_cast<int>(kMiniUTF8UTF8Count[next][sub])]; |
| 2771 destatep->next_utf8utf8_ministate = kMiniUTF8UTF8State[next][sub]; |
| 2772 ++pair_number; |
| 2773 } |
| 2774 |
| 2775 // Boost for completed good sequences; each count covers two chars. |
| 2776 // Design: boost UTF8UTF8 above UTF8 for each good sequence |
| 2777 int total_boost = (2) * destatep->utf8utf8_minicount[2] + |
| 2778 (2) * destatep->utf8utf8_minicount[3] + |
| 2779 (2) * destatep->utf8utf8_minicount[4]; |
| 2780 total_boost *= kGentleOnePair; |
| 2781 total_boost >>= weightshift; |
| 2782 Boost(destatep, F_UTF8UTF8, total_boost); |
| 2783 |
| 2784 // Track total characters |
| 2785 destatep->utf8utf8_minicount[5] += destatep->utf8utf8_minicount[2]; |
| 2786 destatep->utf8utf8_minicount[5] += destatep->utf8utf8_minicount[3]; |
| 2787 destatep->utf8utf8_minicount[5] += destatep->utf8utf8_minicount[4]; |
| 2788 destatep->utf8utf8_minicount[2] = 0; |
| 2789 destatep->utf8utf8_minicount[3] = 0; |
| 2790 destatep->utf8utf8_minicount[4] = 0; |
| 2791 |
| 2792 // Design: Do not whack UTF8UTF8 below UTF8 for each bad sequence |
| 2793 |
| 2794 destatep->utf8utf8_minicount[1] = 0; |
| 2795 return total_boost; |
| 2796 } |
| 2797 |
| 2798 |
| 2799 // boost, whack, or leave alone UTF-32 probablilty |
| 2800 // Expecting 0000PPxx 0000QQxx where PP mostly = QQ (UTF-32BE) |
| 2801 // Expecting xxPP0000 xxQQ0000 where PP mostly = QQ (UTF-32LE) |
| 2802 void CheckUTF32ActiveSeq(DetectEncodingState* destatep) { |
| 2803 // Not needed |
| 2804 return; |
| 2805 } |
| 2806 |
| 2807 // We give a gentle boost for each paired SO ... SI, whack others |
| 2808 void CheckIso2022ActiveSeq(DetectEncodingState* destatep) { |
| 2809 int this_pair = destatep->prior_interesting_pair[OtherPair]; |
| 2810 int startbyteoffset = this_pair * 2; |
| 2811 int endbyteoffset = destatep->next_interesting_pair[OtherPair] * 2; |
| 2812 char* startbyte = &destatep->interesting_pairs[OtherPair][startbyteoffset]; |
| 2813 char* endbyte = &destatep->interesting_pairs[OtherPair][endbyteoffset]; |
| 2814 |
| 2815 // Initial <esc> char must precede SO/SI |
| 2816 // HZ_GB_2312 has no alternation constraint on 1- and 2-byte segments |
| 2817 // ISO-2022-JP (JIS) has no alternation constraint on 1- and 2-byte segments |
| 2818 // ISO-2022-CN has no alternation constraint on 1- and 2-byte segments |
| 2819 // ISO-2022-KR requires alternation between 1- and 2-byte segments |
| 2820 // JIS: |
| 2821 // <esc> ( B ISO-2022-JP [1b 28 42] SI to ASCII |
| 2822 // <esc> ( J ISO-2022-JP [1b 28 4a] SI to X0201 |
| 2823 // <esc> $ @ ISO-2022-JP [1b 24 40] SO to X0208-78 twobyte |
| 2824 // <esc> $ B ISO-2022-JP [1b 24 42] SO to X0208-83 twobyte |
| 2825 for (char* s = startbyte; s < endbyte; s += 2) { |
| 2826 if (s[0] == 0x1b) { |
| 2827 if (s[1] == 0x24) { |
| 2828 // <esc> $ is SO |
| 2829 destatep->next_2022_state = SOSI_TWOBYTE; // SO to two-byte |
| 2830 } else if (s[1] == 0x28) { |
| 2831 if (destatep->next_2022_state == SOSI_TWOBYTE) { |
| 2832 Boost(destatep, F_JIS, kGentlePairBoost); |
| 2833 } else if (destatep->next_2022_state == SOSI_ONEBYTE) { |
| 2834 Whack(destatep, F_JIS, kGentlePairWhack); |
| 2835 } |
| 2836 destatep->next_2022_state = SOSI_ONEBYTE; // JIS SI to one-byte |
| 2837 } else { |
| 2838 Whack(destatep, F_JIS, kBadPairWhack); |
| 2839 Whack(destatep, F_ISO_2022_CN, kBadPairWhack); |
| 2840 Whack(destatep, F_ISO_2022_KR, kBadPairWhack); |
| 2841 destatep->next_2022_state = SOSI_ERROR; // not 2022 |
| 2842 } |
| 2843 } else if (s[0] == 0x0e) { |
| 2844 // <so> |
| 2845 Whack(destatep, F_JIS, kBadPairWhack); |
| 2846 if (destatep->next_2022_state != SOSI_NONE) { |
| 2847 destatep->next_2022_state = SOSI_TWOBYTE; // SO to two-byte |
| 2848 } else { |
| 2849 // ESC required before SO/SI |
| 2850 Whack(destatep, F_ISO_2022_CN, kBadPairWhack * 4); |
| 2851 Whack(destatep, F_ISO_2022_KR, kBadPairWhack * 4); |
| 2852 destatep->next_2022_state = SOSI_ERROR; // SO not after SI |
| 2853 } |
| 2854 } else if (s[0] == 0x0f) { |
| 2855 // <si> |
| 2856 Whack(destatep, F_JIS, kBadPairWhack); |
| 2857 if (destatep->next_2022_state != SOSI_NONE) { |
| 2858 if (destatep->next_2022_state == SOSI_TWOBYTE) { |
| 2859 Boost(destatep, F_ISO_2022_CN, kGentlePairBoost); |
| 2860 Boost(destatep, F_ISO_2022_KR, kGentlePairBoost); |
| 2861 } else if (destatep->next_2022_state == SOSI_ONEBYTE) { |
| 2862 Whack(destatep, F_ISO_2022_CN, kGentlePairWhack); |
| 2863 Whack(destatep, F_ISO_2022_KR, kGentlePairWhack); |
| 2864 } |
| 2865 destatep->next_2022_state = SOSI_ONEBYTE; // SI to one-byte |
| 2866 } else { |
| 2867 // ESC required before SO/SI |
| 2868 Whack(destatep, F_ISO_2022_CN, kBadPairWhack * 4); |
| 2869 Whack(destatep, F_ISO_2022_KR, kBadPairWhack * 4); |
| 2870 destatep->next_2022_state = SOSI_ERROR; // SI not after SO |
| 2871 } |
| 2872 } else if (s[0] <= 0x1f) { |
| 2873 // Some other control code. Allow ht lf [ff] cr |
| 2874 if ((s[0] != 0x09) && (s[0] != 0x0a) && |
| 2875 (s[0] != 0x0c) && (s[0] != 0x0d)) { |
| 2876 // Otherwise these can float to the top on bad bytes |
| 2877 Whack(destatep, F_JIS, kBadPairWhack); |
| 2878 Whack(destatep, F_ISO_2022_CN, kBadPairWhack); |
| 2879 Whack(destatep, F_ISO_2022_KR, kBadPairWhack); |
| 2880 } |
| 2881 } |
| 2882 } |
| 2883 |
| 2884 // If no start, keep the probability pinned at zero (or below) |
| 2885 if (destatep->next_2022_state == SOSI_NONE) { |
| 2886 destatep->enc_prob[F_ISO_2022_CN] = |
| 2887 minint(0, destatep->enc_prob[F_ISO_2022_CN]); |
| 2888 destatep->enc_prob[F_ISO_2022_KR] = |
| 2889 minint(0, destatep->enc_prob[F_ISO_2022_KR]); |
| 2890 destatep->enc_prob[F_JIS] = |
| 2891 minint(0, destatep->enc_prob[F_JIS]); |
| 2892 } |
| 2893 } |
| 2894 |
| 2895 // We give a gentle boost for each paired ~{ ... ~}, whack others |
| 2896 void CheckHzActiveSeq(DetectEncodingState* destatep) { |
| 2897 int this_pair = destatep->prior_interesting_pair[AsciiPair]; |
| 2898 int startbyteoffset = this_pair * 2; |
| 2899 int endbyteoffset = destatep->next_interesting_pair[AsciiPair] * 2; |
| 2900 char* startbyte = &destatep->interesting_pairs[AsciiPair][startbyteoffset]; |
| 2901 char* endbyte = &destatep->interesting_pairs[AsciiPair][endbyteoffset]; |
| 2902 |
| 2903 for (char* s = startbyte; s < endbyte; s += 2) { |
| 2904 // Look for initial ~{ pair |
| 2905 if ((s[0] == '~') && (s[1] == '{')) { |
| 2906 destatep->next_hz_state = SOSI_TWOBYTE; // SO to two-byte |
| 2907 } |
| 2908 // Also look for closing ~} pair |
| 2909 if ((s[0] == '~') && (s[1] == '}')) { |
| 2910 if (destatep->next_hz_state == SOSI_TWOBYTE) { |
| 2911 Boost(destatep, F_HZ_GB_2312, kGentlePairBoost); |
| 2912 } else if (destatep->next_hz_state == SOSI_ONEBYTE) { |
| 2913 Whack(destatep, F_HZ_GB_2312, kGentlePairWhack); |
| 2914 } |
| 2915 destatep->next_hz_state = SOSI_ONEBYTE; // SI to one-byte |
| 2916 } |
| 2917 } |
| 2918 |
| 2919 // If no start, keep the probability pinned at zero (or below) |
| 2920 if (destatep->next_hz_state == SOSI_NONE) { |
| 2921 destatep->enc_prob[F_HZ_GB_2312] = |
| 2922 minint(0, destatep->enc_prob[F_HZ_GB_2312]); |
| 2923 } |
| 2924 } |
| 2925 |
| 2926 // We give a gentle boost after an odd number of 8Fxxxx triples, which |
| 2927 // put subsequent bigrams out of phase until a low byte or another 8Fxxxx |
| 2928 void CheckEucJpSeq(DetectEncodingState* destatep) { |
| 2929 int this_pair = destatep->prior_interesting_pair[OtherPair]; |
| 2930 int startbyteoffset = this_pair * 2; |
| 2931 int endbyteoffset = destatep->next_interesting_pair[OtherPair] * 2; |
| 2932 char* startbyte = &destatep->interesting_pairs[OtherPair][startbyteoffset]; |
| 2933 char* endbyte = &destatep->interesting_pairs[OtherPair][endbyteoffset]; |
| 2934 |
| 2935 for (char* s = startbyte; s < endbyte; s += 2) { |
| 2936 // Boost if out of phase (otherwise, EUC-JP will score badly after 8Fxxxx) |
| 2937 if (destatep->next_eucjp_oddphase) { |
| 2938 //printf(" EucJp boost[%02x%02x]\n", s[0], s[1]); // TEMP |
| 2939 Boost(destatep, F_EUC_JP, kGentlePairBoost * 2); |
| 2940 } |
| 2941 |
| 2942 uint8 s0 = static_cast<uint8>(s[0]); |
| 2943 uint8 s1 = static_cast<uint8>(s[1]); |
| 2944 // Look for phase flip at 8F |
| 2945 if ((s0 & 0x80) == 0x00) { |
| 2946 destatep->next_eucjp_oddphase = false; |
| 2947 } else if (s0 == 0x8f) { |
| 2948 destatep->next_eucjp_oddphase = !destatep->next_eucjp_oddphase; |
| 2949 } |
| 2950 if ((s1 & 0x80) == 0x00) { |
| 2951 destatep->next_eucjp_oddphase = false; |
| 2952 } else if (s1 == 0x8f) { |
| 2953 destatep->next_eucjp_oddphase = !destatep->next_eucjp_oddphase; |
| 2954 } |
| 2955 } |
| 2956 } |
| 2957 |
| 2958 // Boost, whack, or leave alone BINARY probablilty |
| 2959 // Also called if UTF 16/32 active |
| 2960 void CheckBinaryDensity(const uint8* src, DetectEncodingState* destatep, |
| 2961 int delta_otherpairs) { |
| 2962 // No change if not much gathered information |
| 2963 if (delta_otherpairs == 0) { |
| 2964 // Only ASCII pairs this call |
| 2965 return; |
| 2966 } |
| 2967 int next_pair = destatep->next_interesting_pair[OtherPair]; |
| 2968 |
| 2969 // Look at density of interesting pairs [0..src) |
| 2970 int delta_offset = static_cast<int>(src - destatep->initial_src); // actual |
| 2971 |
| 2972 // Look at density of interesting pairs [0..next_interesting) |
| 2973 int low_byte = destatep->interesting_offsets[OtherPair][0]; |
| 2974 //int high_byte = destatep->interesting_offsets[OtherPair][next_pair - 1] + 2; |
| 2975 //int byte_span = high_byte - low_byte; |
| 2976 int byte_span = delta_offset - low_byte; |
| 2977 |
| 2978 // If all ASCII for the first 4KB, reject |
| 2979 // If mostly ASCII in the first 5KB, reject |
| 2980 if ((low_byte >= kBinaryHardAsciiLimit) || (delta_offset >= kBinarySoftAsciiLi
mit)) { |
| 2981 // Not binary early enough in text |
| 2982 Whack(destatep, F_BINARY, kBadPairWhack * 4); |
| 2983 Whack(destatep, F_UTF_32BE, kBadPairWhack * 4); |
| 2984 Whack(destatep, F_UTF_32LE, kBadPairWhack * 4); |
| 2985 Whack(destatep, F_UTF_16BE, kBadPairWhack * 4); |
| 2986 Whack(destatep, F_UTF_16LE, kBadPairWhack * 4); |
| 2987 return; |
| 2988 } |
| 2989 |
| 2990 // Density 1.0 for N pairs takes 2*N bytes |
| 2991 // Whack if < 1/16 after first non_ASCII pair |
| 2992 if ((next_pair * 2 * 16) < byte_span) { |
| 2993 // Not dense enough |
| 2994 Whack(destatep, F_BINARY, kBadPairWhack * 4); |
| 2995 Whack(destatep, F_UTF_32BE, kBadPairWhack * 4); |
| 2996 Whack(destatep, F_UTF_32LE, kBadPairWhack * 4); |
| 2997 Whack(destatep, F_UTF_16BE, kBadPairWhack * 4); |
| 2998 Whack(destatep, F_UTF_16LE, kBadPairWhack * 4); |
| 2999 } |
| 3000 |
| 3001 if (next_pair < 8) { |
| 3002 // Fewer than 8 non-ASCII total; too soon to boost |
| 3003 return; |
| 3004 } |
| 3005 |
| 3006 // Density 1.0 for N pairs takes 2*N bytes |
| 3007 // Boost if density >= 1/4, whack if < 1/16 |
| 3008 if ((next_pair * 2 * 4) >= byte_span) { |
| 3009 // Very dense |
| 3010 // Only boost if at least 2 quadrants seen |
| 3011 if (destatep->binary_quadrants_count >= 2) { |
| 3012 Boost(destatep, F_BINARY, kSmallInitDiff); |
| 3013 Boost(destatep, F_UTF_32BE, kSmallInitDiff); |
| 3014 Boost(destatep, F_UTF_32LE, kSmallInitDiff); |
| 3015 Boost(destatep, F_UTF_16BE, kSmallInitDiff); |
| 3016 Boost(destatep, F_UTF_16LE, kSmallInitDiff); |
| 3017 } |
| 3018 } |
| 3019 } |
| 3020 |
| 3021 |
| 3022 // Look at a number of special-case encodings whose reliable detection depends |
| 3023 // on sequencing or other properties |
| 3024 // AsciiPair probibilities (UTF7 and HZ) are all done here |
| 3025 void ActiveSpecialBoostWhack(const uint8* src, DetectEncodingState* destatep) { |
| 3026 int delta_asciipairs = destatep->next_interesting_pair[AsciiPair] - |
| 3027 destatep->prior_interesting_pair[AsciiPair]; |
| 3028 int delta_otherpairs = destatep->next_interesting_pair[OtherPair] - |
| 3029 destatep->prior_interesting_pair[OtherPair]; |
| 3030 |
| 3031 // The two pure ASCII encodings |
| 3032 if (UTF7OrHzActive(destatep) && (delta_asciipairs > 0)) { |
| 3033 // Adjust per pair |
| 3034 for (int i = 0; i < delta_asciipairs; ++i) { |
| 3035 int next_pair = destatep->prior_interesting_pair[AsciiPair] + i; |
| 3036 uint8 byte1 = destatep->interesting_pairs[AsciiPair][next_pair * 2 + 0]; |
| 3037 uint8 byte2 = destatep->interesting_pairs[AsciiPair][next_pair * 2 + 1]; |
| 3038 if (byte1 == '+') { |
| 3039 // Boost, whack, or leave alone UTF-7 probablilty |
| 3040 UTF7BoostWhack(destatep, next_pair, byte2); |
| 3041 if (destatep->debug_data != NULL) { |
| 3042 // Show UTF7 entry |
| 3043 char buff[16]; |
| 3044 snprintf(buff, sizeof(buff), "%02x%02x+", byte1, byte2); |
| 3045 SetDetailsEncProb(destatep, |
| 3046 destatep->interesting_offsets[AsciiPair][next_pair], |
| 3047 kMostLikelyEncoding[(byte1 << 8) + byte2], |
| 3048 buff); |
| 3049 } |
| 3050 } else if (byte1 == '~') { |
| 3051 // Boost, whack, or leave alone HZ probablilty |
| 3052 HzBoostWhack(destatep, byte1, byte2); |
| 3053 if (destatep->debug_data != NULL) { |
| 3054 // Show Hz entry |
| 3055 char buff[16]; |
| 3056 snprintf(buff, sizeof(buff), "%02x%02x~", byte1, byte2); |
| 3057 SetDetailsEncProb(destatep, |
| 3058 destatep->interesting_offsets[AsciiPair][next_pair], |
| 3059 kMostLikelyEncoding[(byte1 << 8) + byte2], |
| 3060 buff); |
| 3061 } |
| 3062 } |
| 3063 } |
| 3064 |
| 3065 // Kill UTF-7 now if at least 8 + pairs and not confirmed valid UTF-7 |
| 3066 if ((destatep->utf7_starts >= 8) && (destatep->prior_utf7_offset == 0)) { |
| 3067 Whack(destatep, F_UTF7, kBadPairWhack * 8); // flush |
| 3068 } |
| 3069 } |
| 3070 |
| 3071 |
| 3072 |
| 3073 // All the other encodings |
| 3074 if (OtherActive(destatep) && (delta_otherpairs > 0)) { |
| 3075 // Adjust per pair |
| 3076 int biggest_weightshift = 0; |
| 3077 for (int i = 0; i < delta_otherpairs; ++i) { |
| 3078 int next_pair = destatep->prior_interesting_pair[OtherPair] + i; |
| 3079 uint8 byte1 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 0]; |
| 3080 uint8 byte2 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 1]; |
| 3081 int off = destatep->interesting_offsets[OtherPair][next_pair]; |
| 3082 int weightshift = destatep->interesting_weightshift[OtherPair][next_pair]; |
| 3083 biggest_weightshift = maxint(biggest_weightshift, weightshift); |
| 3084 |
| 3085 if (byte1 == 0x00) { |
| 3086 if (byte2 == 0x00) { |
| 3087 UTF1632BoostWhack(destatep, off, byte1); |
| 3088 } else if ((kIsPrintableAscii[byte2] != 0) && ((off & 1) != 0)) { |
| 3089 // We have 00xx at an odd offset. Turn into preceding even offset |
| 3090 // for possible Ascii text in UTF-16LE or UTF-32LE (vs BE) |
| 3091 // This will cascade into caller's probability update |
| 3092 // 00 is illegal for all other encodings, so it doesn't matter to them |
| 3093 UTF16MakeEven(destatep, next_pair); |
| 3094 } |
| 3095 if (destatep->debug_data != NULL) { |
| 3096 // Show 0000 detail entry for this bigram |
| 3097 char buff[16]; |
| 3098 snprintf(buff, sizeof(buff), "%02x%02xZ", byte1, byte2); |
| 3099 SetDetailsEncProb(destatep, |
| 3100 destatep->interesting_offsets[OtherPair][next_pair], |
| 3101 kMostLikelyEncoding[(byte1 << 8) + byte2], |
| 3102 buff); |
| 3103 } |
| 3104 } |
| 3105 if (byte1 == 0xff) { |
| 3106 if (byte2 == 0xff) { |
| 3107 UTF1632BoostWhack(destatep, off, byte1); |
| 3108 } |
| 3109 if (destatep->debug_data != NULL) { |
| 3110 // Show FFFF detail entry for this bigram |
| 3111 char buff[16]; |
| 3112 snprintf(buff, sizeof(buff), "%02x%02xF", byte1, byte2); |
| 3113 SetDetailsEncProb(destatep, |
| 3114 destatep->interesting_offsets[OtherPair][next_pair], |
| 3115 kMostLikelyEncoding[(byte1 << 8) + byte2], |
| 3116 buff); |
| 3117 } |
| 3118 } |
| 3119 if (BinaryActive(destatep)) { |
| 3120 BinaryBoostWhack(destatep, byte1, byte2); |
| 3121 } |
| 3122 } // End for i |
| 3123 |
| 3124 // Adjust per entire-pair-span |
| 3125 int utf8_boost = 0; |
| 3126 int utf8utf8_boost = 0; |
| 3127 if (UTF8Active(destatep)) { |
| 3128 utf8_boost = CheckUTF8Seq(destatep, biggest_weightshift); |
| 3129 } |
| 3130 |
| 3131 if (UTF8UTF8Active(destatep)) { |
| 3132 utf8utf8_boost = CheckUTF8UTF8Seq(destatep, biggest_weightshift); |
| 3133 } |
| 3134 |
| 3135 if (UTF1632Active(destatep)) { |
| 3136 CheckUTF32ActiveSeq(destatep); |
| 3137 } |
| 3138 |
| 3139 if (Iso2022Active(destatep)) { |
| 3140 CheckIso2022ActiveSeq(destatep); |
| 3141 } |
| 3142 |
| 3143 if (HzActive(destatep)) { |
| 3144 CheckHzActiveSeq(destatep); |
| 3145 } |
| 3146 |
| 3147 if (EUCJPActive(destatep)) { |
| 3148 CheckEucJpSeq(destatep); |
| 3149 } |
| 3150 |
| 3151 if (BinaryActive(destatep) || UTF1632Active(destatep)) { |
| 3152 CheckBinaryDensity(src, destatep, delta_otherpairs); |
| 3153 } |
| 3154 } |
| 3155 // ISO-2022 do OK on their own, using stright probabilities? Not on bad bytes |
| 3156 |
| 3157 if (destatep->debug_data != NULL) { |
| 3158 // Show sequencing result |
| 3159 SetDetailsEncLabel(destatep, "seq"); |
| 3160 } |
| 3161 } |
| 3162 |
| 3163 |
| 3164 void PrintTopEnc(DetectEncodingState* destatep, int n) { |
| 3165 // Print top n or fewer |
| 3166 int temp_sort[NUM_RANKEDENCODING]; |
| 3167 for (int j = 0; j < destatep->rankedencoding_list_len; ++j) { |
| 3168 int rankedencoding = destatep->rankedencoding_list[j]; |
| 3169 temp_sort[j] = destatep->enc_prob[rankedencoding]; |
| 3170 } |
| 3171 |
| 3172 qsort(temp_sort, destatep->rankedencoding_list_len, |
| 3173 sizeof(temp_sort[0]), IntCompare); |
| 3174 |
| 3175 int top_n = minint(n, destatep->rankedencoding_list_len); |
| 3176 int showme = temp_sort[top_n - 1]; // Print this value and above |
| 3177 |
| 3178 printf("rankedencodingList top %d: ", top_n); |
| 3179 for (int j = 0; j < destatep->rankedencoding_list_len; ++j) { |
| 3180 int rankedencoding = destatep->rankedencoding_list[j]; |
| 3181 if (showme <= destatep->enc_prob[rankedencoding]) { |
| 3182 printf("%s=%d ", |
| 3183 MyEncodingName(kMapToEncoding[rankedencoding]), |
| 3184 destatep->enc_prob[rankedencoding]); |
| 3185 } |
| 3186 } |
| 3187 printf("\n\n"); |
| 3188 } |
| 3189 |
| 3190 // If the same bigram repeats, don't boost its best encoding too much |
| 3191 bool RepeatedBigram(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) { |
| 3192 int this_bigram = (byte1 << 8) | byte2; |
| 3193 // If 00xx 01xx 02xx ... 1fxx, take out bottom 4 bits of xx. |
| 3194 // This ignores parts of Yahoo 0255 0254 0243 0247 0245 0243 0250 0255 ... |
| 3195 // It may screw up UTF-16BE |
| 3196 // It may screw up ISO-2022 (1b24 suppresses 1b28) |
| 3197 if (byte1 < 0x20) { |
| 3198 this_bigram &= 0xfff0; |
| 3199 } |
| 3200 if (this_bigram == destatep->prior_bigram[0]) {return true;} |
| 3201 if (this_bigram == destatep->prior_bigram[1]) {return true;} |
| 3202 if (this_bigram == destatep->prior_bigram[2]) {return true;} |
| 3203 if (this_bigram == destatep->prior_bigram[3]) {return true;} |
| 3204 // Round-robin replacement |
| 3205 destatep->prior_bigram[destatep->next_prior_bigram] = this_bigram; |
| 3206 destatep->next_prior_bigram = (destatep->next_prior_bigram + 1) & 3; |
| 3207 return false; |
| 3208 } |
| 3209 |
| 3210 // Sometimes illegal bytes are used as markers between text that Javascript |
| 3211 // is going to decode. Don't overboost the Binary encoding for markers 01-FF. |
| 3212 // Just count first pair per 8x4 bucket |
| 3213 bool RepeatedBinary(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) { |
| 3214 int bucket8x4 = ((byte1 & 0xe0) >> 3) | ((byte2 & 0xc0) >> 6); |
| 3215 uint32 bucket8x4_mask = 1 << bucket8x4; |
| 3216 if ((destatep->binary_8x4_seen & bucket8x4_mask) == 0) { |
| 3217 destatep->binary_8x4_seen |= bucket8x4_mask; |
| 3218 destatep->binary_8x4_count += 1; |
| 3219 return false; |
| 3220 } |
| 3221 return true; |
| 3222 } |
| 3223 |
| 3224 |
| 3225 |
| 3226 |
| 3227 // Find current top two rankedencoding probabilities |
| 3228 void ReRank(DetectEncodingState* destatep) { |
| 3229 destatep->top_prob = -1; |
| 3230 destatep->second_top_prob = -1; |
| 3231 // Leave unchanged |
| 3232 //destatep->top_rankedencoding = |
| 3233 // destatep->rankedencoding_list[0]; // Just to make well-defined |
| 3234 //destatep->second_top_rankedencoding = |
| 3235 // destatep->rankedencoding_list[1]; // Just to make well-defined |
| 3236 for (int j = 0; j < destatep->rankedencoding_list_len; j++) { |
| 3237 int rankedencoding = destatep->rankedencoding_list[j]; |
| 3238 if (destatep->top_prob < destatep->enc_prob[rankedencoding]) { |
| 3239 // Make sure top 2 are in different superset groups |
| 3240 if (kMapEncToBaseEncoding[kMapToEncoding[destatep->top_rankedencoding]] != |
| 3241 kMapEncToBaseEncoding[kMapToEncoding[rankedencoding]]) { |
| 3242 destatep->second_top_prob = |
| 3243 destatep->top_prob; // old top to second |
| 3244 destatep->second_top_rankedencoding = |
| 3245 destatep->top_rankedencoding; // old top to second |
| 3246 } |
| 3247 destatep->top_prob = destatep->enc_prob[rankedencoding]; |
| 3248 destatep->top_rankedencoding = rankedencoding; |
| 3249 } else if (destatep->second_top_prob < destatep->enc_prob[rankedencoding]) { |
| 3250 if (kMapEncToBaseEncoding[kMapToEncoding[destatep->top_rankedencoding]] != |
| 3251 kMapEncToBaseEncoding[kMapToEncoding[rankedencoding]]) { |
| 3252 destatep->second_top_prob = destatep->enc_prob[rankedencoding]; |
| 3253 destatep->second_top_rankedencoding = rankedencoding; |
| 3254 } |
| 3255 } |
| 3256 } |
| 3257 } |
| 3258 |
| 3259 void SimplePrune(DetectEncodingState* destatep, int prune_diff) { |
| 3260 // Prune the list of active encoding families |
| 3261 int keep_prob = destatep->top_prob - prune_diff; |
| 3262 |
| 3263 destatep->active_special = 0; |
| 3264 int k = 0; |
| 3265 for (int j = 0; j < destatep->rankedencoding_list_len; j++) { |
| 3266 bool keep = true; |
| 3267 int rankedencoding = destatep->rankedencoding_list[j]; |
| 3268 |
| 3269 // If count is too low, ditch it |
| 3270 if (destatep->enc_prob[rankedencoding] < keep_prob) {keep = false;} |
| 3271 |
| 3272 // Keep it. This will always keep at least top_prob rankedencoding |
| 3273 if (keep) { |
| 3274 destatep->active_special |= kSpecialMask[kMapToEncoding[rankedencoding]]; |
| 3275 destatep->rankedencoding_list[k++] = rankedencoding; |
| 3276 } |
| 3277 } |
| 3278 |
| 3279 destatep->rankedencoding_list_len = k; |
| 3280 } |
| 3281 |
| 3282 // Recalculate reliable |
| 3283 void CalcReliable(DetectEncodingState* destatep) { |
| 3284 // Encoding result is reliable if big difference in top two, or if |
| 3285 // only Ascii7 ever encountered |
| 3286 // Also reliable if exactly one OtherPair and it's best encoding matches top |
| 3287 destatep->reliable = false; |
| 3288 if (destatep->next_interesting_pair[OtherPair] == 0) { |
| 3289 // Only 7-bit ASCII |
| 3290 destatep->reliable = true; |
| 3291 return; |
| 3292 } |
| 3293 if ((destatep->top_prob - destatep->second_top_prob) >= |
| 3294 FLAGS_ced_reliable_difference) { |
| 3295 destatep->reliable = true; |
| 3296 return; |
| 3297 } |
| 3298 if (destatep->next_interesting_pair[OtherPair] == 1) { |
| 3299 uint8 byte1 = destatep->interesting_pairs[OtherPair][0]; |
| 3300 uint8 byte2 = destatep->interesting_pairs[OtherPair][1]; |
| 3301 int best_enc = kMostLikelyEncoding[(byte1 << 8) + byte2]; |
| 3302 if (best_enc == destatep->top_rankedencoding) { |
| 3303 destatep->reliable = true; |
| 3304 return; |
| 3305 } |
| 3306 } |
| 3307 |
| 3308 // If we pruned to one encoding, we are done |
| 3309 if (destatep->rankedencoding_list_len == 1) { |
| 3310 destatep->reliable = true; |
| 3311 destatep->done = true; |
| 3312 return; |
| 3313 } |
| 3314 |
| 3315 // If we pruned to two or three encodings in the same *superset/subset |
| 3316 // rankedencoding* and enough pairs, we are done. Else keep going |
| 3317 if (destatep->rankedencoding_list_len == 2) { |
| 3318 Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]]; |
| 3319 Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]]; |
| 3320 if (kMapEncToBaseEncoding[enc0] == kMapEncToBaseEncoding[enc1]) { |
| 3321 if (destatep->prune_count >= 3) { |
| 3322 destatep->reliable = true; |
| 3323 destatep->done = true; |
| 3324 return; |
| 3325 } |
| 3326 } |
| 3327 } else if (destatep->rankedencoding_list_len == 3) { |
| 3328 Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]]; |
| 3329 Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]]; |
| 3330 Encoding enc2 = kMapToEncoding[destatep->rankedencoding_list[2]]; |
| 3331 Encoding base0 = kMapEncToBaseEncoding[enc0]; |
| 3332 Encoding base1 = kMapEncToBaseEncoding[enc1]; |
| 3333 Encoding base2 = kMapEncToBaseEncoding[enc2]; |
| 3334 |
| 3335 if ((base0 == base1) && (base0 == base2)) { |
| 3336 if (destatep->prune_count >= 3) { |
| 3337 destatep->reliable = true; |
| 3338 destatep->done = true; |
| 3339 return; |
| 3340 } |
| 3341 } |
| 3342 } |
| 3343 |
| 3344 } |
| 3345 |
| 3346 |
| 3347 // Find current top two rankedencoding probabilities |
| 3348 void FindTop2(DetectEncodingState* destatep, |
| 3349 int* first_renc, int* second_renc, |
| 3350 int* first_prob, int* second_prob) { |
| 3351 *first_prob = -1; |
| 3352 *second_prob = -1; |
| 3353 *first_renc = 0; |
| 3354 *second_renc = 0; |
| 3355 for (int j = 0; j < destatep->rankedencoding_list_len; j++) { |
| 3356 int rankedencoding = destatep->rankedencoding_list[j]; |
| 3357 if (*first_prob < destatep->enc_prob[rankedencoding]) { |
| 3358 *second_prob = *first_prob; // old top to second |
| 3359 *second_renc = *first_renc; // old top to second |
| 3360 *first_prob = destatep->enc_prob[rankedencoding]; |
| 3361 *first_renc = rankedencoding; |
| 3362 } else if (*second_prob < destatep->enc_prob[rankedencoding]) { |
| 3363 *second_prob = destatep->enc_prob[rankedencoding]; |
| 3364 *second_renc = rankedencoding; |
| 3365 } |
| 3366 } |
| 3367 } |
| 3368 |
| 3369 |
| 3370 void PrintRankedEncodingList(DetectEncodingState* destatep, const char* str) { |
| 3371 printf("Current ranked encoding list %s\n", str); |
| 3372 for (int j = 0; j < destatep->rankedencoding_list_len; j++) { |
| 3373 int rankedencoding = destatep->rankedencoding_list[j]; |
| 3374 if ((rankedencoding < 0) || (rankedencoding > NUM_RANKEDENCODING)) { |
| 3375 printf(" [%d] BOGUS rankedencoding = %d\n", j, rankedencoding); |
| 3376 } else { |
| 3377 printf(" [%d] rankedencoding = %d %-12.12s enc_prob = %d\n", |
| 3378 j, rankedencoding, MyRankedEncName(rankedencoding), |
| 3379 destatep->enc_prob[rankedencoding]); |
| 3380 } |
| 3381 } |
| 3382 printf("End current ranked encoding list\n\n"); |
| 3383 } |
| 3384 |
| 3385 |
| 3386 |
| 3387 |
| 3388 // Map each byte value (00-FF) down to a five-bit class, largely preserving letters |
| 3389 // There are 33 classes (0, letters 1-26, six high classes) but only 32 codes, so HU shares a code with HO. |
// The XX/HA/HE/HI/HO/HU/Hc macros exist only to keep the table readable and
// are #undef'd right after it.
| 3390 #define XX 0 // Every non-letter in the 00-7F range (controls, space, digits, punctuation) |
| 3391 #define HA 27 // High vowel a in Latin1/2/sometimes7 |
| 3392 #define HE 28 // High vowel e |
| 3393 #define HI 29 // High vowel i |
| 3394 #define HO 30 // High vowel o |
| 3395 #define HU 30 // High vowel u, deliberately on top of HO (same code 30) |
| 3396 #define Hc 31 // High consonant (80-FF range); also every other high byte in the table |
| 3397 static const char kMapToFiveBits[256] = { |
// 00-3F: all XX
| 3398 XX,XX,XX,XX,XX,XX,XX,XX, XX,XX,XX,XX,XX,XX,XX,XX, |
| 3399 XX,XX,XX,XX,XX,XX,XX,XX, XX,XX,XX,XX,XX,XX,XX,XX, |
| 3400 XX,XX,XX,XX,XX,XX,XX,XX, XX,XX,XX,XX,XX,XX,XX,XX, |
| 3401 XX,XX,XX,XX,XX,XX,XX,XX, XX,XX,XX,XX,XX,XX,XX,XX, |
| 3402 |
// 40-7F: 'A'-'Z' and 'a'-'z' both map to 1-26 (case is folded); the rest XX
| 3403 XX, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, |
| 3404 16,17,18,19,20,21,22,23, 24,25,26,XX,XX,XX,XX,XX, |
| 3405 XX, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, |
| 3406 16,17,18,19,20,21,22,23, 24,25,26,XX,XX,XX,XX,XX, |
| 3407 |
// 80-BF: Hc, except x1 -> HA and x8 -> HO in each row of 16
| 3408 Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc, HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc, |
| 3409 Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc, HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc, |
| 3410 Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc, HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc, |
| 3411 Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc, HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc, |
| 3412 |
// C0-FF: vowel classes HA/HE/HI/HO/HU, everything else Hc; rows E0-FF
// repeat rows C0-DF exactly
| 3413 Hc,HA,HA,HA,HA,Hc,Hc,Hc, Hc,HE,HE,HE,HI,HI,HI,Hc, |
| 3414 Hc,Hc,Hc,HO,HO,HO,HO,Hc, Hc,HU,HU,HU,HU,Hc,Hc,Hc, |
| 3415 Hc,HA,HA,HA,HA,Hc,Hc,Hc, Hc,HE,HE,HE,HI,HI,HI,Hc, |
| 3416 Hc,Hc,Hc,HO,HO,HO,HO,Hc, Hc,HU,HU,HU,HU,Hc,Hc,Hc, |
| 3417 |
| 3418 }; |
| 3419 #undef XX |
| 3420 #undef HA |
| 3421 #undef HE |
| 3422 #undef HI |
| 3423 #undef HO |
| 3424 #undef HU |
| 3425 #undef Hc |
| 3426 |
// Trigram verdict codes, one per Latin family plus "none". All four values
// fit in two bits.
// NOTE(review): presumably these are the two-bit values packed into the
// kLatin127Trigrams entries -- confirm against the lookup code.
| 3427 static const int kTriNoneLikely = 0; |
| 3428 static const int kTriLatin1Likely = 1; |
| 3429 static const int kTriLatin2Likely = 2; |
| 3430 static const int kTriLatin7Likely = 3; |
| 3431 |
| 3432 // Each table entry has 32 times two bits, selected by byte[2] |
| 3433 // Entry subscript is selected by byte[0] and byte[1] |
| 3434 // Latin1/2/7 boost vector, generated 2007.09.26 by postproc-enc-detect-short.cc |
| 3435 static const uint64 kLatin127Trigrams[1024] = { |
| 3436 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000
0000000ULL, |
| 3437 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000
0000000ULL, |
| 3438 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000
0000000ULL, |
| 3439 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000
0000000ULL, |
| 3440 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000
0000000ULL, |
| 3441 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000
0000000ULL, |
| 3442 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000
0000000ULL, |
| 3443 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000
0000000ULL, |
| 3444 0x0000000000000000ULL, 0x304080c0402c3330ULL, 0x0008400004000000ULL, 0x082800000
c200000ULL, |
| 3445 0x23a0000420800030ULL, 0x00000000000ccc00ULL, 0x0500100100100000ULL, 0x038840000
0200010ULL, |
| 3446 0x0000000000000c00ULL, 0xd0f0300740f0cf00ULL, 0x2aa0a2a22882a2acULL, 0x081d80000
0000080ULL, |
| 3447 0x0c82000020000000ULL, 0x200a03c000a00000ULL, 0x0008400400290000ULL, 0x040087000
0000000ULL, |
| 3448 0x00f040c00000c080ULL, 0x0008004000000410ULL, 0x0020300000000030ULL, 0x00a030002
c300000ULL, |
| 3449 0x0c8030c020a00000ULL, 0x15410030f0f4c000ULL, 0x3000000300a00000ULL, 0xa2880980a
0880a88ULL, |
| 3450 0x0900300000000000ULL, 0x0000040100300000ULL, 0x0888820020a00000ULL, 0xc04400224
2010000ULL, |
| 3451 0x000000121d300040ULL, 0x40100040440c0d54ULL, 0x00008423102f8144ULL, 0x0b4080840
0000280ULL, |
| 3452 0x0000000000000000ULL, 0x0680a000000c0000ULL, 0x0880008020aa0000ULL, 0x2aaa01410
10a4940ULL, |
| 3453 0xcb80000000010000ULL, 0x2280000000000000ULL, 0x5248000001800000ULL, 0x800040100
4040010ULL, |
| 3454 0x1540010201001010ULL, 0x0080080400000000ULL, 0x5a00044040000108ULL, 0x028800028
2080008ULL, |
| 3455 0x4800008002200000ULL, 0x4a00000000010100ULL, 0x8a88040080000800ULL, 0x014080000
0000400ULL, |
| 3456 0x40010050000c0000ULL, 0x0000008000000000ULL, 0x0028000020140040ULL, 0x862040140
1005308ULL, |
| 3457 0xc082000000000400ULL, 0x05c0b004c0240600ULL, 0x0288000080000000ULL, 0x000001400
0000000ULL, |
| 3458 0x00000000040000c0ULL, 0x8001861008004280ULL, 0x0200000000000300ULL, 0x000024024
2288620ULL, |
| 3459 0x801000c05434c200ULL, 0x9020162040a2d2b4ULL, 0x0021840000240704ULL, 0x2a8028008
0084908ULL, |
| 3460 0x0000000000000000ULL, 0x0500004000000040ULL, 0x0080000000040000ULL, 0x010805810
4440000ULL, |
| 3461 0x0900000000040000ULL, 0x00c0000000208008ULL, 0x2000005000000000ULL, 0x008000000
0050000ULL, |
| 3462 0x0808000000001080ULL, 0x9880810100308000ULL, 0x2285480080081a08ULL, 0x8a8000008
0080000ULL, |
| 3463 0x1450000000600010ULL, 0x2210000100000000ULL, 0x8a88000100011000ULL, 0x154180400
0000010ULL, |
| 3464 0xc084011140040100ULL, 0x0000000000000800ULL, 0x0400000000000030ULL, 0x2a800000a
0890128ULL, |
| 3465 0x1140a00054000104ULL, 0x1440000101200404ULL, 0x028800400400d800ULL, 0x000000000
0000000ULL, |
| 3466 0x0000000000002330ULL, 0x0020820228a02280ULL, 0xa2888a02aa8008a8ULL, 0xd0040a004
4202500ULL, |
| 3467 0x8000044104a29424ULL, 0xc000100178b2c5b4ULL, 0x0000810100241504ULL, 0xd04003000
0380008ULL, |
| 3468 0x0000000000000000ULL, 0x26c08c0000200130ULL, 0x4a08000110080000ULL, 0x2aa000400
1080800ULL, |
| 3469 0x0aac000000004000ULL, 0x2000000000200000ULL, 0x4240000100020000ULL, 0x410000008
0000000ULL, |
| 3470 0x4900040000000000ULL, 0x0800000400300040ULL, 0x6a80000000040800ULL, 0x2a0818200
0588008ULL, |
| 3471 0x0a00000c81000008ULL, 0x0a000c0010000000ULL, 0x8a88001080280808ULL, 0x002000020
0300600ULL, |
| 3472 0xaac00000900a0000ULL, 0x0000100004000000ULL, 0x0020081020000000ULL, 0x822010501
0084110ULL, |
| 3473 0x4a80800000004000ULL, 0x050000c0c0200000ULL, 0x288c000084000000ULL, 0xa04808228
0000000ULL, |
| 3474 0x0000000000000000ULL, 0x8000900000032080ULL, 0xee889e81b8880820ULL, 0xc2200a814
2800424ULL, |
| 3475 0xc020141543361010ULL, 0x10a000204a801634ULL, 0x3a808800802a00a0ULL, 0x28808b008
03d0800ULL, |
| 3476 0x0000000000000000ULL, 0x0020000000000030ULL, 0x0808400121010040ULL, 0x0c2824010
0200040ULL, |
| 3477 0x2008200028800000ULL, 0xc10004c80f30c030ULL, 0x0400440114100000ULL, 0x220820028
0a22220ULL, |
| 3478 0x0600000030c01000ULL, 0x1201001040c00000ULL, 0x0aa02ea22aa22aa0ULL, 0x300080000
00200a0ULL, |
| 3479 0x20c8400400800000ULL, 0x08280b0420800000ULL, 0x0800100000210000ULL, 0x10000300c
0100400ULL, |
| 3480 0xc8c0000420000000ULL, 0x1000000010000000ULL, 0x0420000400000000ULL, 0x022000050
0204000ULL, |
| 3481 0x2200000420000000ULL, 0x0000540400000000ULL, 0x0000000020000000ULL, 0x00080c00a
0810080ULL, |
| 3482 0x1540000000043000ULL, 0x0000000000100000ULL, 0x2e88a22220200a20ULL, 0xc06030e34
ea503a0ULL, |
| 3483 0x0001100204048500ULL, 0x000000e0000c0d54ULL, 0x3000820310a31400ULL, 0x13088c032
0e00280ULL, |
| 3484 0x0000000000000000ULL, 0x0480000000200000ULL, 0x4000200100000000ULL, 0x000030004
0040000ULL, |
| 3485 0x4400000000000000ULL, 0x0401000002240000ULL, 0x0540000000040000ULL, 0x400401000
0000000ULL, |
| 3486 0x4001111001100000ULL, 0x2880000000300040ULL, 0x4040004040002404ULL, 0x020000000
0000000ULL, |
| 3487 0x0140040000100000ULL, 0x4040010040040080ULL, 0x0a00140000041004ULL, 0x0000a0040
0808000ULL, |
| 3488 0x1010200000430040ULL, 0x0010000000000000ULL, 0x0540000000104000ULL, 0x140011400
5000000ULL, |
| 3489 0x0000204000440010ULL, 0x0500000000004400ULL, 0x4500000018000400ULL, 0x000040000
0000000ULL, |
| 3490 0x000000300000cc00ULL, 0x0100001011300000ULL, 0x0040000000000000ULL, 0xc0e000024
8a00444ULL, |
| 3491 0x0000040020340144ULL, 0x0000046445105454ULL, 0x32a0a80280880128ULL, 0x088004000
0100100ULL, |
| 3492 0x0000000000000000ULL, 0x14003000030c0004ULL, 0x4a04001100000000ULL, 0x0a0010801
0000000ULL, |
| 3493 0x28a8004000200248ULL, 0x0100040000b00000ULL, 0x42000000000008c0ULL, 0x600804401
0550010ULL, |
| 3494 0x0800401000010400ULL, 0x080080040cf80000ULL, 0x5080000001001010ULL, 0x2a8010000
0000000ULL, |
| 3495 0xcc8010010d401100ULL, 0x0200000001001000ULL, 0x0480001004001000ULL, 0x8d0080004
0b40210ULL, |
| 3496 0x6200800000300000ULL, 0x0000010000000000ULL, 0x0428004100010000ULL, 0x432010514
1501100ULL, |
| 3497 0xe28c0000000c1000ULL, 0xd5c000c3c0e00300ULL, 0x0001000000100200ULL, 0x100401020
2400008ULL, |
| 3498 0x0000000000003000ULL, 0x2aa038a0800aab08ULL, 0x2a88038000000000ULL, 0xc22004024
2f09720ULL, |
| 3499 0x8020200200ba0420ULL, 0x0020106105101004ULL, 0x0480800000220400ULL, 0x228010008
0000008ULL, |
| 3500 0x0000000000000000ULL, 0x9000000000200000ULL, 0x0001000000100000ULL, 0x2aa40c000
0080800ULL, |
| 3501 0x0040000040010000ULL, 0x0040000000c01000ULL, 0x4000000040000400ULL, 0x000000100
0200000ULL, |
| 3502 0x0000010000000000ULL, 0x05808004000c0000ULL, 0x50400c0000000400ULL, 0x020040008
f000040ULL, |
| 3503 0x0800000000100000ULL, 0x0000000000000000ULL, 0x0a08440000004000ULL, 0x006400040
0008200ULL, |
| 3504 0x0010010010034170ULL, 0x0000000010000000ULL, 0x0100204021000000ULL, 0x022000d00
0010100ULL, |
| 3505 0x0840300000c00000ULL, 0x1400000040204400ULL, 0x09800c0040000000ULL, 0x020970800
0000000ULL, |
| 3506 0x000000000000c040ULL, 0x90000c50204040a0ULL, 0x0000000000000000ULL, 0x00e150004
0200004ULL, |
| 3507 0x8020260540204494ULL, 0x0020026150201054ULL, 0x0281800380105634ULL, 0x088490048
1105000ULL, |
| 3508 0x0000000000000000ULL, 0x84203c00002c0200ULL, 0xc089040000000000ULL, 0xc2a810004
0200004ULL, |
| 3509 0xe00c1c0000000000ULL, 0x0ce1330080200080ULL, 0x0000000000200000ULL, 0xc40011000
0404010ULL, |
| 3510 0x0088400000000000ULL, 0x00083cc00c00c00cULL, 0xcac01c00c000580cULL, 0xe300b0f00
0100000ULL, |
| 3511 0x0300000000000000ULL, 0xc0000f0000000000ULL, 0xc3c01c0400000000ULL, 0x81008004c
0f40000ULL, |
| 3512 0xc3d8003000000440ULL, 0x0000000000000000ULL, 0xc430000000000000ULL, 0x006000000
0001000ULL, |
| 3513 0x0800000000000000ULL, 0x00c03300f0fc0008ULL, 0x3000000400200010ULL, 0xa2a80892a
0880a28ULL, |
| 3514 0x0500000040000004ULL, 0x0000000000000000ULL, 0xc80032070c200020ULL, 0x022082006
0a296a0ULL, |
| 3515 0x802084021db486a0ULL, 0x00000d60080c0080ULL, 0xb281803313a32428ULL, 0x180830032
0300000ULL, |
| 3516 0x0000000000000000ULL, 0x85208cc0ccac1f20ULL, 0x2081000186100808ULL, 0x22a808800
00a0808ULL, |
| 3517 0xaaa8086880000000ULL, 0x802084800a2e9200ULL, 0xa280000000002008ULL, 0xa00000008
0080400ULL, |
| 3518 0x2080010000000008ULL, 0x802020c00c028c80ULL, 0x2080000000140810ULL, 0x2a8008608
0080008ULL, |
| 3519 0x2a800000a8000800ULL, 0xaa881800a2080800ULL, 0xaa98004080280808ULL, 0x004483d0c
0300000ULL, |
| 3520 0xa280002080080000ULL, 0x0000000000300000ULL, 0x22a1030000000008ULL, 0xa8a030108
8880880ULL, |
| 3521 0xaa80002080222808ULL, 0x85400c03fc030400ULL, 0x8a88000000000008ULL, 0xa00800801
0080008ULL, |
| 3522 0x0000000000010000ULL, 0x0040100000301040ULL, 0x28800000a0002008ULL, 0x122482306
cbc0eacULL, |
| 3523 0x8020224222b8c6a0ULL, 0x802002004a82c284ULL, 0x0aa08fc440a41c80ULL, 0x888080d18
1385098ULL, |
| 3524 0x0000000000000000ULL, 0x00c0b000000c0080ULL, 0x2208001000000800ULL, 0x0a2800000
0200000ULL, |
| 3525 0x0000000300000000ULL, 0x00c1040000200000ULL, 0x0203020000000000ULL, 0x024800000
0020000ULL, |
| 3526 0x0000840000100000ULL, 0x0a808c00c000008cULL, 0x5200040040000004ULL, 0x02000c000
00080a0ULL, |
| 3527 0x0b0c000020000000ULL, 0x0b04000001000000ULL, 0x088c0010002000c0ULL, 0x80e08b00c
0030c20ULL, |
| 3528 0x0280000200014040ULL, 0x0000000000000000ULL, 0x0e20a0a008000020ULL, 0x0e280fd03
f00111cULL, |
| 3529 0x200080c020001000ULL, 0x8cc00c02c02f0400ULL, 0x480c0001000c404cULL, 0x020801428
1080808ULL, |
| 3530 0x000000000000fcfcULL, 0x004403300cf00030ULL, 0x2200000000004400ULL, 0x02202000c
08c0c20ULL, |
| 3531 0x02202022683a80a0ULL, 0x4020228028008c00ULL, 0x32208cc0002c0200ULL, 0x3ec00c008
0304008ULL, |
| 3532 0x0000000000000000ULL, 0x34000c00002c0000ULL, 0x0b00000100100030ULL, 0x082301800
0000000ULL, |
| 3533 0x0e8c001c01e00000ULL, 0x1200800600330000ULL, 0x4000110000000000ULL, 0x008000030
0000000ULL, |
| 3534 0x0800000000000000ULL, 0x08c08c04000c0000ULL, 0x0080400000880000ULL, 0x0a0800008
0c00008ULL, |
| 3535 0x0800000304400000ULL, 0x0208000000c00000ULL, 0x2888300080400800ULL, 0x8dc020440
0000000ULL, |
| 3536 0xc0000000c0800000ULL, 0x0000c10000000000ULL, 0x24000c4010c00000ULL, 0x272000541
d811000ULL, |
| 3537 0x0200400000001000ULL, 0x0400000400001004ULL, 0xc08c007004001000ULL, 0x204800400
0000000ULL, |
| 3538 0x000000000003fcfcULL, 0x2aa030000cf8c800ULL, 0xe280000000000000ULL, 0x0a2100814
2000340ULL, |
| 3539 0x0021002000b61040ULL, 0x800004064006d444ULL, 0x3aa0800300230008ULL, 0x0b0003000
0300000ULL, |
| 3540 0x0000000000000000ULL, 0x01c080000000040cULL, 0x0100000000004000ULL, 0x0aa801801
0001000ULL, |
| 3541 0x0800000000100000ULL, 0x3000000000008c00ULL, 0x5400000013000000ULL, 0x02c0c0000
4004010ULL, |
| 3542 0x5241100010000c00ULL, 0x0e00080000000808ULL, 0x5281000000000800ULL, 0x0a0810802
0000800ULL, |
| 3543 0x0a80000000005210ULL, 0x0100000041000000ULL, 0x2a88000002080110ULL, 0x852080000
0c00080ULL, |
| 3544 0x01000010108c0100ULL, 0x0000000000000000ULL, 0x42a0420080000000ULL, 0x002000100
4010010ULL, |
| 3545 0xc4000000000c0000ULL, 0x01000c00c0200400ULL, 0x4600000100000000ULL, 0x000000000
0000000ULL, |
| 3546 0x0010001000000010ULL, 0x910400900820d030ULL, 0x2280000000000000ULL, 0xc22120044
00040e4ULL, |
| 3547 0x8001000000b61420ULL, 0xa00002a248e810b4ULL, 0x32008000002c0008ULL, 0x0c0100348
03c5010ULL, |
| 3548 0x0000000000000000ULL, 0x85008002002c0000ULL, 0x0204001000004010ULL, 0x012000800
0200000ULL, |
| 3549 0x000010000c2000c0ULL, 0xccc0000000200000ULL, 0x0400000c00100040ULL, 0x000330010
0004100ULL, |
| 3550 0x4000551040000004ULL, 0x0e0080000c820808ULL, 0xc000000000080800ULL, 0xc80300000
0000000ULL, |
| 3551 0x0a4000c000200000ULL, 0x0040000000c00000ULL, 0x0918145000405000ULL, 0x81400000c
0300400ULL, |
| 3552 0x0050000000000000ULL, 0xd000045000000000ULL, 0x0400004000400000ULL, 0x042010401
0000110ULL, |
| 3553 0x0700000000203000ULL, 0x34800300c0e00704ULL, 0x4440100044000400ULL, 0x004000004
0000000ULL, |
| 3554 0x0030000044000000ULL, 0xeaaca0008808c880ULL, 0x0a01000000200000ULL, 0x1220a3004
03ccf20ULL, |
| 3555 0x002024c200b61044ULL, 0x802014346aa2d434ULL, 0x30008c00c0820c44ULL, 0x0a0000000
00c4800ULL, |
| 3556 0x0000000000000000ULL, 0x0000404000340c90ULL, 0x08a8a10820800280ULL, 0x812800902
2201000ULL, |
| 3557 0x0020808228a000a0ULL, 0x0020400100410000ULL, 0x0400000110000000ULL, 0xa60900000
0200000ULL, |
| 3558 0x8008330000d00000ULL, 0x8060100040404010ULL, 0xeaa00ea0ea00808cULL, 0x200c8020a
0000020ULL, |
| 3559 0x0408800020200000ULL, 0x0189001403200000ULL, 0xc00800000000c000ULL, 0x200430c00
c300000ULL, |
| 3560 0x0100300100004000ULL, 0x0000040000000000ULL, 0x2420000400001000ULL, 0x89a120040
0000000ULL, |
| 3561 0x20c8a000208c0000ULL, 0x8080000000000000ULL, 0x28a0108020210080ULL, 0xa2a84800a
0880988ULL, |
| 3562 0x258008000400c000ULL, 0x0140000000100000ULL, 0xa028a222a0aa0228ULL, 0xc06001205
4044040ULL, |
| 3563 0x0010010400000000ULL, 0x00000050150c0114ULL, 0x0000008010c20010ULL, 0xaa088000a
0200880ULL, |
| 3564 0x0000000000000000ULL, 0x0700b0c0000c0000ULL, 0x2200040000080030ULL, 0x2aa880804
0240800ULL, |
| 3565 0x08b0500000000100ULL, 0x1000830400200000ULL, 0x4204000010000000ULL, 0x40c220005
0040050ULL, |
| 3566 0x0104404001010000ULL, 0x1a808c8103c00030ULL, 0x30900010c0000b00ULL, 0x200812b28
3000008ULL, |
| 3567 0x000c000020e00000ULL, 0x2140000000400000ULL, 0x0288000080200000ULL, 0x8060a200c
8a20280ULL, |
| 3568 0x0400114010215000ULL, 0x0000000000000000ULL, 0x082b200002000010ULL, 0x22a003000
0031000ULL, |
| 3569 0x008100001000000cULL, 0x05400c00c0230400ULL, 0xca3000003c080100ULL, 0x000000002
0000004ULL, |
| 3570 0x0000000100000000ULL, 0x8004320813f5c000ULL, 0xa280080200000800ULL, 0xc22000044
e334c20ULL, |
| 3571 0x000004146e361024ULL, 0x800126806aa0d584ULL, 0xb000a0040023c41cULL, 0x0a0830008
03053d8ULL, |
| 3572 0x0000000000000000ULL, 0x0000100000020000ULL, 0x0000000010000010ULL, 0x000000004
5040004ULL, |
| 3573 0x0000000000100000ULL, 0x0000020400000010ULL, 0x0003015000000000ULL, 0x040000000
0000000ULL, |
| 3574 0x0000000400000000ULL, 0x0100000000000800ULL, 0x0000001000000000ULL, 0x000000000
0000000ULL, |
| 3575 0x0000000040000000ULL, 0x0000000000000000ULL, 0x0004001000000000ULL, 0x000800100
0000000ULL, |
| 3576 0x0010000000000004ULL, 0x0000010100001000ULL, 0x0004000000000004ULL, 0x000001404
0050014ULL, |
| 3577 0x0014000000000040ULL, 0x5540000000041000ULL, 0x0000000000000000ULL, 0x000004000
0000d00ULL, |
| 3578 0x0000000000000000ULL, 0x0000000000100000ULL, 0x0001000000000000ULL, 0x000000000
0000000ULL, |
| 3579 0x0000000000000000ULL, 0x0000000000000000ULL, 0x4500000000040400ULL, 0x000080000
0000400ULL, |
| 3580 0x0000000000000000ULL, 0x13e080000020000cULL, 0xcf00001005100000ULL, 0x04a800800
0200300ULL, |
| 3581 0x00280100100000c0ULL, 0x1c8c000040200000ULL, 0x0600005000100000ULL, 0x050800000
c104000ULL, |
| 3582 0x4c10101000110000ULL, 0x0c00000000300000ULL, 0x22040c00100000c0ULL, 0x080070001
0100000ULL, |
| 3583 0x0000000000001000ULL, 0x0a08000010000040ULL, 0x0800034004210010ULL, 0x04e000040
0000000ULL, |
| 3584 0x0800030020000000ULL, 0x0000005000000000ULL, 0x0400110101304110ULL, 0x042800001
0a01000ULL, |
| 3585 0x060b000000800010ULL, 0x35810c00c020c000ULL, 0x00800c4321800000ULL, 0x420808802
0000080ULL, |
| 3586 0x040000111003ff00ULL, 0x0020900020202080ULL, 0x22888180a8000888ULL, 0x022520054
2005420ULL, |
| 3587 0x2020040400340020ULL, 0x10300424500cc444ULL, 0x3081a00400e00200ULL, 0x33001300c
0300000ULL, |
| 3588 0x0000000000000000ULL, 0x04003c0000000000ULL, 0x0a04001000100100ULL, 0x140800000
1000000ULL, |
| 3589 0x1800000044100000ULL, 0x3400040400000300ULL, 0x5000040801000040ULL, 0x408840104
0000040ULL, |
| 3590 0x1010110130100000ULL, 0xca800c3000300000ULL, 0x5a01000000080100ULL, 0x020280000
cd01300ULL, |
| 3591 0x0302000410200010ULL, 0x0000102000300000ULL, 0x0b09000000000000ULL, 0x20008004c
4800004ULL, |
| 3592 0x28c0410010000000ULL, 0x0004015041000050ULL, 0x0a01006000200200ULL, 0x0020d0000
0100040ULL, |
| 3593 0x0010a00100900000ULL, 0x3500bf00c0030300ULL, 0x080c010000200d00ULL, 0x224800000
4020010ULL, |
| 3594 0x0000c00000000000ULL, 0x8044b00200e08000ULL, 0xaaa82aa2aa8a2aa8ULL, 0x022000224
1c08604ULL, |
| 3595 0x4200260440328444ULL, 0x68001226103008b4ULL, 0x3a0080c0b0000400ULL, 0x2a8048048
03c4008ULL, |
| 3596 0x0000000000000000ULL, 0x04008c0300000400ULL, 0x008000c0000c0000ULL, 0x088001000
000001cULL, |
| 3597 0x0840000001000010ULL, 0x0400000000200c00ULL, 0x4244000101040000ULL, 0x423800701
1100000ULL, |
| 3598 0x1000d00100000010ULL, 0x1d00800400300000ULL, 0x4204080c00000000ULL, 0x2a8808008
0000008ULL, |
| 3599 0x08001c0200001000ULL, 0x0a00000400000000ULL, 0x8a88003080080000ULL, 0x052180040
0300000ULL, |
| 3600 0x3200051000201000ULL, 0x0000000000000000ULL, 0x0020801404000000ULL, 0x322010401
c0c101cULL, |
| 3601 0x0c01100013000000ULL, 0x04003000c0204000ULL, 0x088c0020a0cc0000ULL, 0x220000008
0000018ULL, |
| 3602 0x0404000044000000ULL, 0x82a0b000008820b0ULL, 0x0000040020440000ULL, 0xc26500044
03f1420ULL, |
| 3603 0x0021340241b64464ULL, 0x8020040242c2d474ULL, 0x32018c0480288000ULL, 0x00800b008
0300000ULL, |
| 3604 0x0000000000000000ULL, 0x05008c0000040130ULL, 0xc0d8000000800000ULL, 0x002000002
0200200ULL, |
| 3605 0x23a2000120204000ULL, 0x5052100550104150ULL, 0x1000101100040000ULL, 0xc40001c30
1000000ULL, |
| 3606 0x8288000000c00000ULL, 0x5150040144d01404ULL, 0xea8c0ea028ae088cULL, 0xc31010c00
0000c80ULL, |
| 3607 0x0002000060000000ULL, 0xc80800f030000000ULL, 0x0000000400300000ULL, 0xc00080c00
ff0c344ULL, |
| 3608 0x00080001200c0000ULL, 0x0000050080000000ULL, 0x0328000300300000ULL, 0x082030000
cc01040ULL, |
| 3609 0xeb08800100004000ULL, 0x8030003300c80f00ULL, 0xfb0d0000e4ac0000ULL, 0x002000608
0000008ULL, |
| 3610 0x0500100100040000ULL, 0x1140000000000000ULL, 0xcb883330a0e00000ULL, 0xc00001005
0000080ULL, |
| 3611 0x0010104005b54150ULL, 0x40111d5155001554ULL, 0x80000070140f0004ULL, 0x0b0830c3a
0003380ULL, |
| 3612 0x0000000000000000ULL, 0x04c13000000f830cULL, 0x2808000000000000ULL, 0x281000000
0000800ULL, |
| 3613 0x08c0080004400000ULL, 0x04c0240300801c20ULL, 0x4040000080000004ULL, 0x000040010
0100010ULL, |
| 3614 0x020001008000c0c0ULL, 0x1d008c000c3c0000ULL, 0x0080003000000800ULL, 0x228808008
0000008ULL, |
| 3615 0x0a84004020220000ULL, 0x0800080000100000ULL, 0xaa80004080400008ULL, 0x802400040
0c01660ULL, |
| 3616 0x80841c2001000104ULL, 0x0001000000000000ULL, 0x0020028020020280ULL, 0x086040401
1900100ULL, |
| 3617 0xec80080200000000ULL, 0x010103c100200400ULL, 0x0200004000000000ULL, 0x000000000
0400400ULL, |
| 3618 0x000010000003fcfcULL, 0x8040083238c20000ULL, 0x08800220a0920a00ULL, 0x082100044
83c0c24ULL, |
| 3619 0xc020240740b0a200ULL, 0x802006014a201494ULL, 0x3201233070ac0e00ULL, 0x080028060
33a48a0ULL, |
| 3620 0x0000000000000000ULL, 0x8020820028a00680ULL, 0x2000002000000104ULL, 0x22a808011
00a0808ULL, |
| 3621 0xa2a8002080000000ULL, 0xa000800008a08000ULL, 0x0000100000400000ULL, 0x800000210
0000000ULL, |
| 3622 0x0000010000004404ULL, 0xa2a0088080000888ULL, 0x0000000010400800ULL, 0xa28008208
0080008ULL, |
| 3623 0x2280000080010008ULL, 0x2000000000000000ULL, 0x228800008c080808ULL, 0x802182800
2a98200ULL, |
| 3624 0xa200002000080000ULL, 0x0000040000000000ULL, 0x22a0000080000000ULL, 0x202882c20
0800080ULL, |
| 3625 0xa000000001004000ULL, 0x000000c808a00600ULL, 0x0000000010000000ULL, 0x000001000
000040cULL, |
| 3626 0x0000000000000000ULL, 0x802002a2a8aa82a0ULL, 0x20000024a8088228ULL, 0x802082000
1000000ULL, |
| 3627 0x8020000000808280ULL, 0x8000000000000000ULL, 0x0020800000200280ULL, 0x208008228
0a00888ULL, |
| 3628 0x0000000000000000ULL, 0x0000015000000040ULL, 0x0000040000040000ULL, 0x010001001
0001000ULL, |
| 3629 0x0000003210008000ULL, 0x0000000404000000ULL, 0x0000000000000400ULL, 0x020000000
0000000ULL, |
| 3630 0x0000000000000100ULL, 0x5180014400004050ULL, 0x1000000014000000ULL, 0x420000000
0000000ULL, |
| 3631 0x0040200000000000ULL, 0x0201004000000000ULL, 0x0a00000000000010ULL, 0x004020000
0800000ULL, |
| 3632 0x0040051000000500ULL, 0x0000000100800400ULL, 0x6000000000000000ULL, 0x000000000
0000000ULL, |
| 3633 0x280000c1400040ccULL, 0x4180001000000000ULL, 0x00000000c1000104ULL, 0x000000000
0000000ULL, |
| 3634 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0080000000c00000ULL, 0x000400606
6004000ULL, |
| 3635 0x0000005000040440ULL, 0x0000106005804044ULL, 0x0000a10511004440ULL, 0x000000000
0000110ULL, |
| 3636 0x0000000000000000ULL, 0x0000000000080000ULL, 0xeb0808a020800080ULL, 0x29a800810
02a1800ULL, |
| 3637 0x0b2c000202100100ULL, 0x0001000000888000ULL, 0x2280102010000000ULL, 0x020000602
a004110ULL, |
| 3638 0x8a800160a6108100ULL, 0x0280000000000020ULL, 0x8a8000a0a8808208ULL, 0x028088208
0500308ULL, |
| 3639 0x0b18010020804100ULL, 0xeb080000c0080080ULL, 0x2b08000000810130ULL, 0x000000000
8040020ULL, |
| 3640 0xaa0a08e082894140ULL, 0x0000000000000000ULL, 0x202081409010001cULL, 0x8aa880508
2806000ULL, |
| 3641 0xeb082900289c0000ULL, 0x0000000000008000ULL, 0xf80c2e20002e0000ULL, 0xa28808042
0880888ULL, |
| 3642 0x0000010000000000ULL, 0x0000000000102000ULL, 0x22880000a8a80808ULL, 0x022022a22
aa880a0ULL, |
| 3643 0x0000222222aa0620ULL, 0x0000022002800000ULL, 0x208080004028a000ULL, 0x2b8888008
01c0828ULL, |
| 3644 0x0000000000000000ULL, 0x22e0828280a08028ULL, 0xaa88002082080308ULL, 0x0ea800804
10a0040ULL, |
| 3645 0x2a28222000a00000ULL, 0x8aa2808028a0a2a0ULL, 0x0200001000000000ULL, 0x82080000a
0000000ULL, |
| 3646 0x8800000082000808ULL, 0x2a008a0000300888ULL, 0x0a80080080080808ULL, 0xaa8828008
40b0808ULL, |
| 3647 0x0a80000080000040ULL, 0xea080820a0000000ULL, 0xaa88080080080808ULL, 0x8040a2800
a8024a0ULL, |
| 3648 0xaa800020a0080808ULL, 0x0000040000000000ULL, 0x2a280a0080080880ULL, 0x2a2008108
0008a00ULL, |
| 3649 0x2a88882088aa0008ULL, 0x81800202c0a01480ULL, 0xea88082082200000ULL, 0xaa8800208
0080008ULL, |
| 3650 0x0000100000000000ULL, 0x802082a22aa0a2a0ULL, 0x2e80000000000000ULL, 0x0220a2a26
aa0a2a8ULL, |
| 3651 0x800022a2228a22a0ULL, 0x880002212e82c0b0ULL, 0x02a0aa0002a82228ULL, 0x2d808b008
0380008ULL, |
| 3652 0x0000000000000000ULL, 0x000407551c154244ULL, 0x2a00208088a02228ULL, 0x12a82182a
2402a88ULL, |
| 3653 0xe32821e020826d00ULL, 0x801130100ccc1330ULL, 0x028010c000841008ULL, 0x88a08002a
0a664a0ULL, |
| 3654 0x0048270080000100ULL, 0x00001f010cd10f30ULL, 0xe2242ce22aaea2a0ULL, 0xc2c00cc20
ae22460ULL, |
| 3655 0xe208003128021c10ULL, 0x2a2021c010821080ULL, 0x2a88202082202020ULL, 0x401011110
4941410ULL, |
| 3656 0xc80c02c182b00080ULL, 0x0000040000000000ULL, 0xe28030068002c300ULL, 0x2aa02024a
2a22228ULL, |
| 3657 0xe20889328aa22080ULL, 0x0000000000210100ULL, 0xaa0028e0a9b221a0ULL, 0x200000808
0400000ULL, |
| 3658 0x0000010041150404ULL, 0x0000105114410100ULL, 0xeaa82aa6aaaaaaa8ULL, 0x000000f44
300c434ULL, |
| 3659 0x0000222222b00020ULL, 0x0000002000000000ULL, 0x0000004014000000ULL, 0x0039b3f73
fbcd3fcULL, |
| 3660 0x0000000000000000ULL, 0x0000104015045040ULL, 0x20a80490a08800a0ULL, 0x40a825841
0a909a0ULL, |
| 3661 0xe0a8a2022aa2e2a0ULL, 0xc111010014000500ULL, 0x2080044041840004ULL, 0x28a820022
0a2aba0ULL, |
| 3662 0x008400a0a2840800ULL, 0x0101015451009464ULL, 0x20000ea0e02c2c2cULL, 0xe2a828a2a
ca2aaa8ULL, |
| 3663 0x682020a228a222a0ULL, 0xe8882ae22aa2a2a0ULL, 0xe9a80e6022a24140ULL, 0x001105500
5001040ULL, |
| 3664 0x2aa8208229a0aaa4ULL, 0x0000040000000000ULL, 0x28a0228026a62260ULL, 0xe2a020a42
2a2a020ULL, |
| 3665 0xe808a0022aa1a220ULL, 0x0000010014000100ULL, 0x28ac22802aa2a020ULL, 0x002000000
0000000ULL, |
| 3666 0x0100010100040000ULL, 0x0000000000000000ULL, 0x22a822a22a8aaaa0ULL, 0x000000000
0000000ULL, |
| 3667 0x0000102410800100ULL, 0x0000000000000000ULL, 0x0000000002000000ULL, 0x00000fb2a
08c0aa8ULL, |
| 3668 0x0000000000000000ULL, 0x4010005015440140ULL, 0x18c81c00b180001cULL, 0x280004802
1820800ULL, |
| 3669 0x8ab820c06a802580ULL, 0x00100170f4040000ULL, 0x4000144041041404ULL, 0x0ac800d00
02e440cULL, |
| 3670 0x20880820a2000808ULL, 0x400000f03f300c00ULL, 0xaa000ea22aa22aa0ULL, 0xa2880ac0a
8942a20ULL, |
| 3671 0xaa880a81a1804188ULL, 0xeea022a0aaa02080ULL, 0xaaa820a2aaa66120ULL, 0x000000511
5800150ULL, |
| 3672 0x2a880920a0840040ULL, 0x0000040000000000ULL, 0xaea82222aaa22a28ULL, 0x8a2804126
0055150ULL, |
| 3673 0xa28824008aa28880ULL, 0x0000025014019000ULL, 0xea882ae02aa200a0ULL, 0x000000000
0000000ULL, |
| 3674 0x0000000040000400ULL, 0x0000000000000000ULL, 0xaaa82aa22aaaaaa0ULL, 0x000000000
0000000ULL, |
| 3675 0x0000000000000000ULL, 0x002003003c80c000ULL, 0x0000020014000000ULL, 0x00200010a
0980a20ULL, |
| 3676 0x0000000000000000ULL, 0x0020001200801240ULL, 0x0a88000089800020ULL, 0xcaa00080a
1000000ULL, |
| 3677 0x0a200c0020a04080ULL, 0x4002034003840880ULL, 0x4690500190000050ULL, 0x222800400
0601000ULL, |
| 3678 0x0a803f00803f400cULL, 0x400033e24dd0cf34ULL, 0xaa80a2a229a220a0ULL, 0x0a2240000
02c0000ULL, |
| 3679 0x028000202000008cULL, 0x0a08000070000030ULL, 0x00800c040020000cULL, 0x000000000
2850000ULL, |
| 3680 0x02881cc310200000ULL, 0x0000040004000000ULL, 0xcba8000400000080ULL, 0xcaa02c068
0000000ULL, |
| 3681 0xcc880002008c4080ULL, 0x300000f007f0cf0cULL, 0x0a80001080a00000ULL, 0x820880802
a880a80ULL, |
| 3682 0x0000050001040004ULL, 0x0000011000000000ULL, 0x0a8020a2a0202000ULL, 0x000002220
2008000ULL, |
| 3683 0x0000222212808000ULL, 0x0020226010000000ULL, 0x000033f33ff3c33cULL, 0x00288002a
08c02a8ULL, |
| 3684 0x0000000000000000ULL, 0x04408e0000008200ULL, 0x0808004000900000ULL, 0x0aa820001
0ca00c0ULL, |
| 3685 0x0ba80101005d4010ULL, 0x00018604802c8288ULL, 0x00049400101c0000ULL, 0x000c10111
0505010ULL, |
| 3686 0x0000000000100000ULL, 0x30000c00c022000cULL, 0xd0c00dd0d51d431cULL, 0x000800001
0100000ULL, |
| 3687 0x000c1001a0280000ULL, 0x0bc80000c0000000ULL, 0x0a00000080280000ULL, 0x8000a0022
0308420ULL, |
| 3688 0x0808000010301000ULL, 0x0000040000000000ULL, 0x0d00031480100000ULL, 0x072000001
08c0300ULL, |
| 3689 0x0bc0a0c000004000ULL, 0x8000b002c0208480ULL, 0x340c0100118c111cULL, 0x800800802
0890000ULL, |
| 3690 0x0000000000040010ULL, 0x0020b00320c1d0b0ULL, 0x00002000000c0000ULL, 0x0020be226
e2008a0ULL, |
| 3691 0x002010c03fb0a6a0ULL, 0x00202e222aaec284ULL, 0x00008f0000208400ULL, 0x000000000
0300000ULL, |
| 3692 }; |
| 3693 // Latin1 6%, Latin2 11%, Latin7 3% |
| 3694 |
| 3695 |
| 3696 |
// Debug-only helper, not thread-safe: formats the low 15 bits of trisub
// (three 5-bit fields, most significant first) as a three-character,
// NUL-terminated string. The result is written into one shared static
// buffer, so each call overwrites the text returned by the previous call.
static char tri_string[4];
char* Latin127Str(int trisub) {
  // Display character for each of the 32 possible 5-bit codes.
  static const char kFiveBitChars[] = "_abcdefghijklmnopqrstuvwxyzAEIOC";
  int shift = 10;
  for (int pos = 0; pos < 3; ++pos) {
    tri_string[pos] = kFiveBitChars[(trisub >> shift) & 0x1f];
    shift -= 5;
  }
  tri_string[3] = '\0';
  return tri_string;
}
| 3706 |
| 3707 // Returns two bits per three-byte trigram, indicating |
| 3708 // dont-care, Latin1 likely, Latin2 likely, and Latin7 (ISO-8859-13) likely |
| 3709 int TrigramValue(const uint8* trisrc) { |
| 3710 int byte0_p = kMapToFiveBits[trisrc[0]]; |
| 3711 int byte1_p = kMapToFiveBits[trisrc[1]]; |
| 3712 int byte2_p = kMapToFiveBits[trisrc[2]]; |
| 3713 int subscr = ((byte0_p) << 5) | byte1_p; |
| 3714 int temp = static_cast<int>((kLatin127Trigrams[subscr] >> (byte2_p * 2))); |
| 3715 //printf("%s=%d ", Latin127Str((subscr << 5) | byte2_p), temp & 3); |
| 3716 return temp & 3; |
| 3717 } |
| 3718 |
| 3719 |
| 3720 // Put out trigrams for surrounding 32 bytes for Latin encodings |
| 3721 // Return true if more Latin2 & 7 than Latin1 |
| 3722 bool BoostLatin127Trigrams(int tri_block_offset, |
| 3723 DetectEncodingState* destatep) { |
| 3724 //printf("BoostLatin127Trigrams[%06x]\n", tri_block_offset); |
| 3725 int excess_latin27 = 0; |
| 3726 int srclen = destatep->limit_src - destatep->initial_src; |
| 3727 int hi_limit = minint(tri_block_offset + 32, srclen - 2); |
| 3728 const uint8* trisrc = &destatep->initial_src[tri_block_offset]; |
| 3729 const uint8* trisrclimit = &destatep->initial_src[hi_limit]; |
| 3730 while (trisrc < trisrclimit) { |
| 3731 // Selectively boost Latin1, Latin2, or Latin7 and friends |
| 3732 int trigram_val = TrigramValue(trisrc); |
| 3733 if (trigram_val != 0) { |
| 3734 if (FLAGS_enc_detect_source) { |
| 3735 PsHighlight(trisrc, destatep->initial_src, trigram_val, 1); |
| 3736 } |
| 3737 if (trigram_val == kTriLatin1Likely) { |
| 3738 Boost(destatep, F_Latin1, kTrigramBoost); |
| 3739 Boost(destatep, F_CP1252, kTrigramBoost); |
| 3740 // We don't want to upset the relative rank of a declared 8859-15 |
| 3741 Boost(destatep, F_ISO_8859_15, kTrigramBoost); |
| 3742 --excess_latin27; |
| 3743 } else if (trigram_val == kTriLatin2Likely) { |
| 3744 Boost(destatep, F_Latin2, kTrigramBoost); |
| 3745 Boost(destatep, F_CP1250, kTrigramBoost); |
| 3746 ++excess_latin27; |
| 3747 } else if (trigram_val == kTriLatin7Likely) { |
| 3748 Boost(destatep, F_ISO_8859_13, kTrigramBoost); |
| 3749 Boost(destatep, F_CP1257, kTrigramBoost); |
| 3750 // We don't want to upset the relative rank of a declared 8859-4 or -6 |
| 3751 // for Estonian |
| 3752 Boost(destatep, F_Latin4, kTrigramBoost); |
| 3753 Boost(destatep, F_Latin6, kTrigramBoost); |
| 3754 ++excess_latin27; |
| 3755 } |
| 3756 } |
| 3757 |
| 3758 ++trisrc; |
| 3759 } |
| 3760 //printf("\n"); |
| 3761 |
| 3762 return (0 < excess_latin27); |
| 3763 } |
| 3764 |
| 3765 |
| 3766 |
| 3767 // Boost any encodings that need extra detection help, then prune |
| 3768 // src is first unscanned byte |
| 3769 // slowend means extra pruning when dropping out of initial slow scan |
| 3770 // final means last call -- no bigram at src |
| 3771 void BoostPrune(const uint8* src, DetectEncodingState* destatep, |
| 3772 int prunereason) { |
| 3773 int delta_asciipairs = destatep->next_interesting_pair[AsciiPair] - |
| 3774 destatep->prior_interesting_pair[AsciiPair]; |
| 3775 int delta_otherpairs = destatep->next_interesting_pair[OtherPair] - |
| 3776 destatep->prior_interesting_pair[OtherPair]; |
| 3777 |
| 3778 if (prunereason == PRUNE_FINAL) { |
| 3779 // We are about done |
| 3780 // If we get here with very little accumulated data, the initial hints |
| 3781 // were too strong, so we derate them to n+1 / 12 for n bigrams |
| 3782 if (!destatep->hints_derated && |
| 3783 (destatep->next_interesting_pair[OtherPair] < kDerateHintsBelow)) { |
| 3784 int n = destatep->next_interesting_pair[OtherPair]; |
| 3785 |
| 3786 // Map N pairs to (N+1)/12 portions of the initial hints, etc. |
| 3787 // Floor of 3/12 -- 1/12 and 2/12 are too easy to overcome |
| 3788 int m = maxint(3, (n + 1)); |
| 3789 for (int i = 0; i < NUM_RANKEDENCODING; ++i) { |
| 3790 int original_delta = destatep->hint_prob[i]; |
| 3791 int scaled_delta = (original_delta * m) / kDerateHintsBelow; |
| 3792 destatep->enc_prob[i] -= original_delta; |
| 3793 destatep->enc_prob[i] += scaled_delta; |
| 3794 } |
| 3795 destatep->hints_derated = true; |
| 3796 if (destatep->debug_data != NULL) { |
| 3797 // Show derated-hint result |
| 3798 char buff[32]; |
| 3799 snprintf(buff, sizeof(buff), "Hints %d/%d", m, kDerateHintsBelow); |
| 3800 SetDetailsEncLabel(destatep, buff); |
| 3801 } |
| 3802 } |
| 3803 } |
| 3804 |
| 3805 |
| 3806 ++destatep->prune_count; |
| 3807 |
| 3808 if (prunereason != PRUNE_FINAL) { |
| 3809 // Early outs |
| 3810 if (destatep->rankedencoding_list_len <= 1) { // nothing to prune |
| 3811 destatep->done = true; |
| 3812 return; |
| 3813 } |
| 3814 |
| 3815 if ((destatep->prune_count > 0) && |
| 3816 (delta_asciipairs + delta_otherpairs) == 0) { |
| 3817 // Nothing to do; must have just been called earlier |
| 3818 return; |
| 3819 } |
| 3820 } |
| 3821 |
| 3822 |
| 3823 |
| 3824 // INCREMENT |
| 3825 // ==================== |
| 3826 // Accumulate OtherPair probibilities over all active families |
| 3827 // AsciiPair probibilities are all done in ActiveSpecialBoostWhack |
| 3828 uint8 prior_bad_byte1 = ' '; // won't match first bad pair |
| 3829 uint8 prior_bad_byte2 = ' '; // won't match first bad pair |
| 3830 uint8 or_byte1 = 0; // Track if any current pair has a high bit |
| 3831 int counted_otherpairs = 0; |
| 3832 uint8 prior_byte1x2x = 0; |
| 3833 for (int i = 0; i < delta_otherpairs; ++i) { |
| 3834 int watch1_incr = 0; |
| 3835 int watch2_incr = 0; |
| 3836 int next_pair = destatep->prior_interesting_pair[OtherPair] + i; |
| 3837 |
| 3838 uint8 byte1 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 0]; |
| 3839 uint8 byte2 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 1]; |
| 3840 uint8 byte1x2x = (byte1 & 0xf0) | ((byte2 >> 4) & 0x0f); |
| 3841 int weightshift = destatep->interesting_weightshift[OtherPair][next_pair]; |
| 3842 |
| 3843 int offset_byte12 = destatep->interesting_offsets[OtherPair][next_pair]; |
| 3844 |
| 3845 // To help distinguish some Cyrillic, Arabic, Greek, Hebrew, Thai |
| 3846 // Remember if this is a CDEF pair immediately following the previous pair |
| 3847 // 8xxx CxCx or CxCx 8xxx |
| 3848 bool next_pair_consec_hi = false; |
| 3849 if (ConsecutivePair(destatep, next_pair)) { |
| 3850 if ((byte1x2x & 0xcc) == 0xcc) { // 8xxx CxCx |
| 3851 next_pair_consec_hi = true; |
| 3852 } else if ((prior_byte1x2x & 0xcc) == 0xcc) { // CxCx 8xxx |
| 3853 next_pair_consec_hi = true; |
| 3854 } |
| 3855 } |
| 3856 //printf("prior/cur/consec %02x %02x %d\n", |
| 3857 // prior_byte1x2x, byte1x2x, next_pair_consec_hi); |
| 3858 prior_byte1x2x = byte1x2x; |
| 3859 |
| 3860 or_byte1 |= byte1; |
| 3861 uint8 byte1f = byte1; |
| 3862 // Flip top bit of subscript to better separate quadrant 4 (esp. for Hebrew) |
| 3863 byte1f ^= (byte2 & 0x80); |
| 3864 |
| 3865 // If the same bigram occurred recently, don't increment again |
| 3866 bool pair_used = false; |
| 3867 if (!RepeatedBigram(destatep, byte1, byte2)) { |
| 3868 ++counted_otherpairs; |
| 3869 pair_used = true; |
| 3870 // Boost both charset= declared encodings, so |
| 3871 // Nearly-same probability nearby encoding doesn't drift to the top |
| 3872 if (!FLAGS_demo_nodefault) { |
| 3873 destatep->enc_prob[destatep->declared_enc_1] += kDeclaredEncBoost >> wei
ghtshift; |
| 3874 destatep->enc_prob[destatep->declared_enc_2] += kDeclaredEncBoost >> wei
ghtshift; |
| 3875 } |
| 3876 bool was_bad_pair = false; |
| 3877 for (int j = 0; j < destatep->rankedencoding_list_len; j++) { |
| 3878 int incr_shift = 0; |
| 3879 int rankedencoding = destatep->rankedencoding_list[j]; |
| 3880 Encoding enc = kMapToEncoding[rankedencoding]; |
| 3881 |
| 3882 // For binary, Skip over repeated marker bytes, such as 02, FF, etc. |
| 3883 if ((rankedencoding == F_BINARY) && |
| 3884 RepeatedBinary(destatep, byte1, byte2)) { |
| 3885 incr_shift = 2; // count 1/4 as much if repeated |
| 3886 } |
| 3887 |
| 3888 // If byte 1x2x for this encoding is exactly zero, illegal byte pair |
| 3889 // Don't increment, but instead penalize |
| 3890 const UnigramEntry* ue = &unigram_table[rankedencoding]; |
| 3891 if (ue->b12[byte1x2x] == 0) { |
| 3892 // Don't whack consecutive duplicate bad pairs -- overkill |
| 3893 if ((byte1 != prior_bad_byte1) || (byte2 != prior_bad_byte2)) { |
| 3894 // Extra whack for illegal pair in this encoding |
| 3895 Whack(destatep, rankedencoding, kBadPairWhack >> weightshift); |
| 3896 was_bad_pair = true; |
| 3897 } |
| 3898 } else { |
| 3899 // OK to do the real increment |
| 3900 int incr = ue->b1[byte1f] + ue->b2[byte2] + ue->b12[byte1x2x]; |
| 3901 if ((ue->b12[byte1x2x] & 0x01) != 0) { |
| 3902 // Use a more-precise table |
| 3903 int byte32x32 = ((byte1 & 0x1f) << 5) | (byte2 & 0x1f); |
| 3904 int hiressub = (byte2 & 0x60) >> 5; // select w/bits 5&6 of byte 2 |
| 3905 DCHECK(ue->hires[hiressub] != NULL); |
| 3906 incr += ue->hires[hiressub][byte32x32]; |
| 3907 } else { |
| 3908 // Default final offset |
| 3909 incr += ue->so; |
| 3910 } |
| 3911 incr >>= incr_shift; |
| 3912 |
| 3913 incr >>= weightshift; |
| 3914 destatep->enc_prob[rankedencoding] += incr; // The actual increment |
| 3915 |
| 3916 if (FLAGS_enc_detect_detail2) { |
| 3917 if (watch1_rankedenc == rankedencoding) {watch1_incr = incr;} |
| 3918 if (watch2_rankedenc == rankedencoding) {watch2_incr = incr;} |
| 3919 } |
| 3920 } |
| 3921 |
| 3922 |
| 3923 // If consecutive pair of high bytes, give slight boost to one-byte |
| 3924 // encodings that have a full alphabet in the high bytes |
| 3925 if (next_pair_consec_hi && HighAlphaEncoding(enc)) { |
| 3926 Boost(destatep, rankedencoding, kDeclaredEncBoost >> weightshift); |
| 3927 } |
| 3928 } // End for j < rankedencoding_list_len |
| 3929 |
| 3930 if (was_bad_pair) { |
| 3931 prior_bad_byte1 = byte1; |
| 3932 prior_bad_byte2 = byte2; |
| 3933 } |
| 3934 |
| 3935 // Fold in per-bigram most likely encoding for first N bigrams |
| 3936 if (next_pair < kBestPairsCount) { |
| 3937 int best_enc = kMostLikelyEncoding[(byte1 << 8) + byte2]; |
| 3938 Boost(destatep, best_enc, kBestEncBoost >> weightshift); |
| 3939 } |
| 3940 |
| 3941 // Possibly score 32 trigrams around a bigram to better separate |
| 3942 // Latin1 from Latin2 and Latin7. Especially helpful for detecting |
| 3943 // mis-labelled Hungarian latin2. |
| 3944 // If looking and at bigram 0,8,16,... do full scoring, else just 1 tri |
| 3945 if (destatep->do_latin_trigrams || |
| 3946 destatep->looking_for_latin_trigrams) { |
| 3947 // If just looking, do full scan every 8 times |
| 3948 // Just look up one trigram the other 7 and do full scan if Latin2,7 |
| 3949 bool scan32 = false; |
| 3950 const uint8* trisrc = &destatep->initial_src[offset_byte12 - 1]; |
| 3951 if (!destatep->do_latin_trigrams) { |
| 3952 if ((i & 7) == 0 || trisrc + 3 > destatep->limit_src) { |
| 3953 scan32 = true; |
| 3954 } else { |
| 3955 scan32 = (kTriLatin1Likely < TrigramValue(trisrc)); |
| 3956 } |
| 3957 } |
| 3958 if (destatep->do_latin_trigrams || scan32) { |
| 3959 // Just score each block of 32 bytes once |
| 3960 int tri_block_offset = offset_byte12 & ~0x1f; |
| 3961 if (destatep->trigram_highwater_mark <= tri_block_offset) { |
| 3962 bool turnon = BoostLatin127Trigrams(tri_block_offset, destatep); |
| 3963 if (FLAGS_counts && !destatep->do_latin_trigrams && turnon) { |
| 3964 ++doing_used; // First time |
| 3965 } |
| 3966 if (FLAGS_enc_detect_source) { |
| 3967 if (!destatep->do_latin_trigrams && turnon) { |
| 3968 // First time |
| 3969 PsHighlight(trisrc, destatep->initial_src, 0, 2); |
| 3970 } |
| 3971 } |
| 3972 destatep->do_latin_trigrams |= turnon; |
| 3973 destatep->trigram_highwater_mark = tri_block_offset + 32; |
| 3974 } |
| 3975 } |
| 3976 } |
| 3977 |
| 3978 } // end if RepeatedBigram() |
| 3979 |
| 3980 // Keep track of initial byte high 3 bits |
| 3981 ++destatep->byte32_count[byte1 >> 5]; |
| 3982 |
| 3983 |
| 3984 // TODO: boost subset/superset also |
| 3985 // Boost(destatep, kRelatedEncoding[best_enc], kBestEncBoost); |
| 3986 |
| 3987 if (destatep->debug_data != NULL) { |
| 3988 // Show detail entry for this bigram |
| 3989 char buff[16]; |
| 3990 snprintf(buff, sizeof(buff), "%c%02x%02x%c%c", |
| 3991 pair_used ? ' ' : '[', |
| 3992 byte1, |
| 3993 byte2, |
| 3994 pair_used ? ' ' : ']', |
| 3995 (weightshift == 0) ? ' ' : '-'); |
| 3996 |
| 3997 SetDetailsEncProb(destatep, |
| 3998 destatep->interesting_offsets[OtherPair][next_pair], |
| 3999 kMostLikelyEncoding[(byte1 << 8) + byte2], |
| 4000 buff); |
| 4001 } |
| 4002 if (FLAGS_enc_detect_detail2) { |
| 4003 if ((watch1_incr != 0) || (watch2_incr != 0)) { |
| 4004 // Show increment detail for this encoding |
| 4005 char buff[32]; |
| 4006 snprintf(buff, sizeof(buff), "%c%d %c%d", |
| 4007 (watch1_incr < 0) ? '-' : '+', watch1_incr, |
| 4008 (watch2_incr < 0) ? '-' : '+', watch2_incr); |
| 4009 SetDetailsEncLabel(destatep, buff); |
| 4010 } |
| 4011 } |
| 4012 } // End for i |
| 4013 |
| 4014 |
| 4015 // If no high bit on, demote all the two-byte codes |
| 4016 // WAS BUG. This was inside the loop above and should be outside |
| 4017 if ((counted_otherpairs > 0) && ((or_byte1 & 0x80) == 0)) { |
| 4018 // No high bit in this group (just 02xx, etc.). Whack 2-byte codes |
| 4019 // This keeps SJS from creeping past Latin1 on illegal C0 bytes |
| 4020 for (int j = 0; j < destatep->rankedencoding_list_len; j++) { |
| 4021 int rankedencoding = destatep->rankedencoding_list[j]; |
| 4022 Encoding enc = kMapToEncoding[rankedencoding]; |
| 4023 if (TwoByteEncoding(enc)) { |
| 4024 Whack(destatep, rankedencoding, kGentlePairWhack * counted_otherpairs); |
| 4025 } |
| 4026 } |
| 4027 } |
| 4028 |
| 4029 |
| 4030 // BOOST |
| 4031 // ==================== |
| 4032 if (AnyActive(destatep)) { |
| 4033 ActiveSpecialBoostWhack(src, destatep); |
| 4034 } |
| 4035 |
| 4036 // Update for next time |
| 4037 destatep->prior_src = src; |
| 4038 destatep->prior_interesting_pair[AsciiPair] = |
| 4039 destatep->next_interesting_pair[AsciiPair]; |
| 4040 destatep->prior_interesting_pair[OtherPair] = |
| 4041 destatep->next_interesting_pair[OtherPair]; |
| 4042 |
| 4043 |
| 4044 // Do any pre-prune final adjustments |
| 4045 // ==================== |
| 4046 if (prunereason == PRUNE_FINAL) { |
| 4047 // If UTF8 not in base state, whack |
| 4048 if (destatep->next_utf8_ministate != 0) { |
| 4049 Whack(destatep, F_UTF8, kGentlePairWhack * 2 * 1); |
| 4050 } |
| 4051 // If UTF8UTF8 not in base state, whack |
| 4052 if (destatep->next_utf8utf8_ministate != 0) { |
| 4053 Whack(destatep, F_UTF8UTF8, kGentlePairWhack * 2 * 1); |
| 4054 } |
| 4055 |
| 4056 // If no valid UTF-8 char ever seen, whack |
| 4057 if (destatep->utf8_minicount[5] == 0) { |
| 4058 Whack(destatep, F_UTF8, kBadPairWhack * 8); // No sequence |
| 4059 Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8); // No sequence |
| 4060 } |
| 4061 |
| 4062 // If no valid UTF8UTF8 char ever seen, whack |
| 4063 if (destatep->utf8utf8_minicount[5] == 0) { |
| 4064 Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8); // No sequence |
| 4065 } |
| 4066 |
| 4067 // If not all four binary quadrants, whack BINARY; |
| 4068 // worth 2 pair if 3 quads, 4 pair if 1 or 2 quads |
| 4069 if (destatep->binary_quadrants_count < 4) { |
| 4070 if (destatep->binary_quadrants_count == 3) { |
| 4071 Whack(destatep, F_BINARY, kBadPairWhack * 2); |
| 4072 } else { |
| 4073 Whack(destatep, F_BINARY, kBadPairWhack * 4); |
| 4074 } |
| 4075 } |
| 4076 |
| 4077 // If 1st pair is 1b24, choose between ISO-2022-xx |
| 4078 // <esc> $ ) C ISO-2022-KR [1b 24 29 43] |
| 4079 // <esc> $ ) A ISO-2022-CN [1b 24 29 41] |
| 4080 // <esc> $ ) G ISO-2022-CN [1b 24 29 47] |
| 4081 // <esc> $ * H ISO-2022-CN [1b 24 2a 48] |
| 4082 // <esc> ( B ISO-2022-JP [1b 28 42] to ASCII |
| 4083 // <esc> ( J ISO-2022-JP [1b 28 4a] to X0201 |
| 4084 // <esc> $ @ ISO-2022-JP [1b 24 40] to X0208-78 twobyte |
| 4085 // <esc> $ B ISO-2022-JP [1b 24 42] to X0208-83 twobyte |
| 4086 if ((destatep->next_interesting_pair[OtherPair] >= 1) && |
| 4087 Iso2022Active(destatep)) { |
| 4088 if ((destatep->interesting_pairs[OtherPair][0] == 0x1b) && |
| 4089 (destatep->interesting_pairs[OtherPair][1] == 0x24)) { |
| 4090 int offset = destatep->interesting_offsets[OtherPair][0]; |
| 4091 const uint8* esc_src = destatep->initial_src + offset; |
| 4092 if ((destatep->initial_src + offset) < (destatep->limit_src - 3)) { |
| 4093 if ((esc_src[2] == ')') && (esc_src[3] == 'C')) { |
| 4094 Boost(destatep, F_ISO_2022_KR, kBoostOnePair); |
| 4095 Whack(destatep, F_ISO_2022_CN, kBadPairWhack); |
| 4096 Whack(destatep, F_JIS, kBadPairWhack); |
| 4097 } else if ((esc_src[2] == ')') && ((esc_src[3] == 'A') || |
| 4098 (esc_src[3] == 'G'))) { |
| 4099 Boost(destatep, F_ISO_2022_CN, kBoostOnePair); |
| 4100 Whack(destatep, F_ISO_2022_KR, kBadPairWhack); |
| 4101 Whack(destatep, F_JIS, kBadPairWhack); |
| 4102 } else if ((esc_src[2] == '@') || (esc_src[2] == 'B')) { |
| 4103 Boost(destatep, F_JIS, kBoostOnePair); |
| 4104 Whack(destatep, F_ISO_2022_CN, kBadPairWhack); |
| 4105 Whack(destatep, F_ISO_2022_KR, kBadPairWhack); |
| 4106 } |
| 4107 } else { |
| 4108 // Incomplete escape sequence. Whack them all |
| 4109 Whack(destatep, F_JIS, kBadPairWhack); |
| 4110 Whack(destatep, F_ISO_2022_CN, kBadPairWhack); |
| 4111 Whack(destatep, F_ISO_2022_KR, kBadPairWhack); |
| 4112 } |
| 4113 } |
| 4114 } |
| 4115 if (destatep->debug_data != NULL) { |
| 4116 SetDetailsEncLabel(destatep, "pre-final"); |
| 4117 } |
| 4118 } |
| 4119 |
| 4120 // PRUNE |
| 4121 // ==================== |
| 4122 // Find current top two rankedencoding probabilities |
| 4123 ReRank(destatep); |
| 4124 |
| 4125 if (prunereason == PRUNE_SLOWEND) { |
| 4126 if (destatep->debug_data != NULL) { |
| 4127 SetDetailsEncLabel(destatep, "slow-end"); |
| 4128 } |
| 4129 } |
| 4130 |
| 4131 // Keep every rankedencoding with probablity >= top_prob - prune_difference |
| 4132 int prune_diff = destatep->prune_difference; |
| 4133 // If the top encoding is BINARY, it might be overstated, and we might |
| 4134 // therefore prune away the real encoding. Make the pruning delta |
| 4135 // twice as big. |
| 4136 if (destatep->top_rankedencoding == F_BINARY) { |
| 4137 prune_diff *= 2; |
| 4138 } |
| 4139 int keep_prob = destatep->top_prob - prune_diff; |
| 4140 |
| 4141 // Tighten pruning difference (we start wide) for next time |
| 4142 if (destatep->prune_difference > kFinalPruneDifference) { |
| 4143 int decrement = kPruneDiffDecrement; |
| 4144 // If only ASCII pairs, small tighten; if some non-ASCII, full tighten |
| 4145 if (counted_otherpairs == 0) { |
| 4146 decrement >>= 1; |
| 4147 } |
| 4148 destatep->prune_difference -= decrement; |
| 4149 } |
| 4150 |
| 4151 // Prune the list of active encoding families |
| 4152 destatep->active_special = 0; |
| 4153 int k = 0; |
| 4154 for (int j = 0; j < destatep->rankedencoding_list_len; j++) { |
| 4155 bool keep = true; |
| 4156 int rankedencoding = destatep->rankedencoding_list[j]; |
| 4157 |
| 4158 // If count is too low, ditch it |
| 4159 if (destatep->enc_prob[rankedencoding] < keep_prob) { |
| 4160 keep = false; |
| 4161 } |
| 4162 |
| 4163 // If at end of slow section, ditch any 7-bit with zero evidence so far |
| 4164 if ((prunereason == PRUNE_SLOWEND) && |
| 4165 SevenBitEncoding(kMapToEncoding[rankedencoding]) && |
| 4166 (destatep->enc_prob[rankedencoding] <= 0) && |
| 4167 (rankedencoding != destatep->top_rankedencoding)) { |
| 4168 keep = false; |
| 4169 } |
| 4170 |
| 4171 // Keep it. This will always keep at least top_prob rankedencoding |
| 4172 if (keep) { |
| 4173 destatep->active_special |= kSpecialMask[kMapToEncoding[rankedencoding]]; |
| 4174 destatep->rankedencoding_list[k++] = rankedencoding; |
| 4175 } |
| 4176 } |
| 4177 |
| 4178 if (destatep->debug_data != NULL) { |
| 4179 char buff[32]; |
| 4180 snprintf(buff, sizeof(buff), "%d prune", prune_diff / XLOG2); |
| 4181 SetDetailsEncLabel(destatep, buff); |
| 4182 } |
| 4183 destatep->rankedencoding_list_len = k; |
| 4184 |
| 4185 |
| 4186 |
| 4187 // Force final result in some cases |
| 4188 // Do any post-prune final adjustments |
| 4189 if (prunereason == PRUNE_FINAL) { |
| 4190 // If no high-byte pairs, result is ASCII7, BINARY, UTF7, 2022, or HZ |
| 4191 if (destatep->next_interesting_pair[OtherPair] == 0) { |
| 4192 if ((destatep->top_rankedencoding != F_BINARY) && |
| 4193 (destatep->top_rankedencoding != F_UTF7) && |
| 4194 (destatep->top_rankedencoding != F_ISO_2022_CN) && |
| 4195 (destatep->top_rankedencoding != F_ISO_2022_KR) && |
| 4196 (destatep->top_rankedencoding != F_JIS) && |
| 4197 (destatep->top_rankedencoding != F_HZ_GB_2312)) { |
| 4198 destatep->top_rankedencoding = F_ASCII_7_bit; |
| 4199 Boost(destatep, F_ASCII_7_bit, kBoostOnePair * 2); |
| 4200 } |
| 4201 } |
| 4202 |
| 4203 // If some 89 pairs, not ISO_8859_x and vice versa |
| 4204 if (destatep->byte32_count[4] > 0) { |
| 4205 switch (destatep->top_rankedencoding) { |
| 4206 case F_ASCII: // ISO-8859-1 |
| 4207 destatep->top_rankedencoding = F_CP1252; |
| 4208 // Better: destatep->enc_prob[F_ASCII] <==> destatep->enc_prob[F_CP1252] |
| 4209 Boost(destatep, F_CP1252, kBoostOnePair * 2); |
| 4210 break; |
| 4211 case F_Latin2: // ISO-8859-2 |
| 4212 // Don't swap back; not superset |
| 4213 //destatep->top_rankedencoding = F_CP1250; |
| 4214 //Boost(destatep, F_CP1250, kBoostOnePair * 2); |
| 4215 break; |
| 4216 case F_Arabic: // ISO-8859-6 |
| 4217 destatep->top_rankedencoding = F_CP1256; |
| 4218 Boost(destatep, F_CP1256, kBoostOnePair * 2); |
| 4219 break; |
| 4220 case F_Greek: // ISO-8859-7 |
| 4221 // Don't swap -- not proper superset |
| 4222 // Capital Alpha tonos at 0xB6 in ISO-8859-7, 0xA2 in CP1253 |
| 4223 //destatep->top_rankedencoding = F_CP1253; |
| 4224 //Boost(destatep, F_CP1253, kBoostOnePair * 2); |
| 4225 break; |
| 4226 case F_Hebrew: // ISO-8859-8 |
| 4227 // Don't swap -- visual vs. logical |
| 4228 //destatep->top_rankedencoding = F_CP1255; |
| 4229 //Boost(destatep, F_CP1255, kBoostOnePair * 2); |
| 4230 break; |
| 4231 case F_Latin5: // ISO-8859-9 |
| 4232 destatep->top_rankedencoding = F_CP1254; |
| 4233 Boost(destatep, F_CP1254, kBoostOnePair * 2); |
| 4234 break; |
| 4235 case F_ISO_8859_11: // ISO-8859-11 |
| 4236 destatep->top_rankedencoding = F_CP874; |
| 4237 Boost(destatep, F_CP874, kBoostOnePair * 2); |
| 4238 break; |
| 4239 } |
| 4240 } else { |
| 4241 switch (destatep->top_rankedencoding) { |
| 4242 case F_CP1252: // ISO-8859-1 |
| 4243 destatep->top_rankedencoding = F_ASCII; |
| 4244 Boost(destatep, F_ASCII, kBoostOnePair * 2); |
| 4245 break; |
| 4246 case F_CP1250: // ISO-8859-2 |
| 4247 // Don't swap back; not superset |
| 4248 //destatep->top_rankedencoding = F_Latin2; |
| 4249 //Boost(destatep, F_Latin2, kBoostOnePair * 2); |
| 4250 break; |
| 4251 case F_CP1256: // ISO-8859-6 |
| 4252 // Don't swap back -- not proper superset |
| 4253 //destatep->top_rankedencoding = F_Arabic; |
| 4254 //Boost(destatep, F_Arabic, kBoostOnePair * 2); |
| 4255 break; |
| 4256 case F_CP1253: // ISO-8859-7 |
| 4257 // Don't swap back -- not proper superset |
| 4258 //destatep->top_rankedencoding = F_Greek; |
| 4259 //Boost(destatep, F_Greek, kBoostOnePair * 2); |
| 4260 break; |
| 4261 case F_CP1255: // ISO-8859-8 |
| 4262 // Don't swap back -- not proper superset |
| 4263 //destatep->top_rankedencoding = F_Hebrew; |
| 4264 //Boost(destatep, F_Hebrew, kBoostOnePair * 2); |
| 4265 break; |
| 4266 case F_CP1254: // ISO-8859-9 |
| 4267 destatep->top_rankedencoding = F_Latin5; |
| 4268 Boost(destatep, F_Latin5, kBoostOnePair * 2); |
| 4269 break; |
| 4270 case F_CP874: // ISO-8859-11 |
| 4271 destatep->top_rankedencoding = F_ISO_8859_11; |
| 4272 Boost(destatep, F_ISO_8859_11, kBoostOnePair * 2); |
| 4273 break; |
| 4274 } |
| 4275 } |
| 4276 |
| 4277 if (destatep->debug_data != NULL) { |
| 4278 char buff[32]; |
| 4279 snprintf(buff, sizeof(buff), "final %d", |
| 4280 static_cast<int>(src - destatep->initial_src)); |
| 4281 SetDetailsEncLabel(destatep, buff); |
| 4282 |
| 4283 // Show winning encoding and its delta log base2 from 2nd-best |
| 4284 // Divide delta by XLOG2 to get log base 2 |
| 4285 int delta = destatep->top_prob - destatep->second_top_prob; |
| 4286 if (delta < (2 * XLOG2)) { |
| 4287 delta /= XDECILOG2; |
| 4288 snprintf(buff, sizeof(buff), "+%d.%d %s ", |
| 4289 delta / 10, delta % 10, |
| 4290 MyEncodingName(kMapToEncoding[destatep->top_rankedencoding])); |
| 4291 } else if (delta < (50 * XLOG2)) { |
| 4292 delta /= XLOG2; |
| 4293 snprintf(buff, sizeof(buff), "+%d %s", |
| 4294 delta, |
| 4295 MyEncodingName(kMapToEncoding[destatep->top_rankedencoding])); |
| 4296 } else { |
| 4297 snprintf(buff, sizeof(buff), "%s", |
| 4298 MyEncodingName(kMapToEncoding[destatep->top_rankedencoding])); |
| 4299 } |
| 4300 SetDetailsEncProbCopyOffset(destatep, destatep->top_rankedencoding, buff); |
| 4301 } |
| 4302 } |
| 4303 |
| 4304 |
| 4305 // FINISH |
| 4306 // ==================== |
| 4307 // Eventual encoding result is reliable if big difference in top two, or if |
| 4308 // only Ascii7 ever encountered |
| 4309 // Also reliable if exactly one OtherPair and it's best encoding matches top |
| 4310 destatep->reliable = false; |
| 4311 if (destatep->next_interesting_pair[OtherPair] == 0) { |
| 4312 // Only 7-bit ASCII |
| 4313 destatep->reliable = true; |
| 4314 } |
| 4315 if ((destatep->top_prob - destatep->second_top_prob) >= |
| 4316 FLAGS_ced_reliable_difference) { |
| 4317 destatep->reliable = true; |
| 4318 } |
| 4319 if (destatep->next_interesting_pair[OtherPair] == 1) { |
| 4320 uint8 byte1 = destatep->interesting_pairs[OtherPair][0]; |
| 4321 uint8 byte2 = destatep->interesting_pairs[OtherPair][1]; |
| 4322 int best_enc = kMostLikelyEncoding[(byte1 << 8) + byte2]; |
| 4323 if (best_enc == destatep->top_rankedencoding) { |
| 4324 destatep->reliable = true; |
| 4325 } |
| 4326 } |
| 4327 |
| 4328 // If we pruned to one encoding, we are done |
| 4329 if (destatep->rankedencoding_list_len == 1) { |
| 4330 destatep->reliable = true; |
| 4331 destatep->done = true; |
| 4332 } |
| 4333 |
| 4334 // If we pruned to two or three encodings in the same *superset/subset |
| 4335 // rankedencoding* and enough pairs, we are done. Else keep going |
| 4336 if (destatep->rankedencoding_list_len == 2) { |
| 4337 Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]]; |
| 4338 Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]]; |
| 4339 if (kMapEncToBaseEncoding[enc0] == kMapEncToBaseEncoding[enc1]) { |
| 4340 if (destatep->prune_count >= 3) { |
| 4341 destatep->reliable = true; |
| 4342 destatep->done = true; |
| 4343 } |
| 4344 } |
| 4345 } else if (destatep->rankedencoding_list_len == 3) { |
| 4346 Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]]; |
| 4347 Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]]; |
| 4348 Encoding enc2 = kMapToEncoding[destatep->rankedencoding_list[2]]; |
| 4349 Encoding base0 = kMapEncToBaseEncoding[enc0]; |
| 4350 Encoding base1 = kMapEncToBaseEncoding[enc1]; |
| 4351 Encoding base2 = kMapEncToBaseEncoding[enc2]; |
| 4352 |
| 4353 if ((base0 == base1) && (base0 == base2)) { |
| 4354 if (destatep->prune_count >= 3) { |
| 4355 destatep->reliable = true; |
| 4356 destatep->done = true; |
| 4357 } |
| 4358 } |
| 4359 } |
| 4360 } |
| 4361 |
| 4362 |
| 4363 // Accumulate aligned byte-pair at src |
| 4364 // Occasionally, calc boost for some encodings and then prune the active list |
| 4365 // weightshift is used to give low weight some text, such as inside tags |
| 4366 // Returns true if pruning occurred |
| 4367 bool IncrementAndBoostPrune(const uint8* src, |
| 4368 int remaining_length, |
| 4369 DetectEncodingState* destatep, |
| 4370 int weightshift, |
| 4371 int exit_reason) { |
| 4372 destatep->last_pair = src; |
| 4373 // Pick up byte pair, or very last byte plus 0x20 |
| 4374 uint8 byte1 = src[0]; |
| 4375 uint8 byte2 = 0x20; |
| 4376 if (1 < remaining_length) {byte2 = src[1];} |
| 4377 |
| 4378 // whatset=0 for Ascii + ~, 1 for all others; see kTestPrintableAsciiTildePlus |
| 4379 int whatset = exit_reason - 1; |
| 4380 int next_pair = destatep->next_interesting_pair[whatset]; |
| 4381 |
| 4382 if (next_pair > 16) { |
| 4383 // If not clear by 16 bigrams, stop accumulating + ~ 00 |
| 4384 if (byte1 == '+') {return false;} |
| 4385 if (byte1 == '~') {return false;} |
| 4386 if (byte1 == 0x00) {return false;} |
| 4387 } |
| 4388 |
| 4389 // Remember pair in appropriate list |
| 4390 if (next_pair >= kMaxPairs) { |
| 4391 // We have filled up our alloted space for interesting pairs with no |
| 4392 // decision. If ASCII pairs full, just skip until end of slow loop; if |
| 4393 // non-Ascii pairs full, force done |
| 4394 if (whatset == OtherPair) { |
| 4395 destatep->done = true; |
| 4396 } |
| 4397 } else { |
| 4398 int offset = static_cast<int>(src - destatep->initial_src); |
| 4399 destatep->interesting_pairs[whatset][next_pair * 2 + 0] = byte1; |
| 4400 destatep->interesting_pairs[whatset][next_pair * 2 + 1] = byte2; |
| 4401 destatep->interesting_offsets[whatset][next_pair] = offset; |
| 4402 destatep->interesting_weightshift[whatset][next_pair] = weightshift; |
| 4403 ++destatep->next_interesting_pair[whatset]; |
| 4404 ++next_pair; |
| 4405 } |
| 4406 |
| 4407 // Prune now and then , but always if forced to be done |
| 4408 if (destatep->done || ((next_pair & kPruneMask) == 0)) { // Prune every M |
| 4409 BoostPrune(src + 2, destatep, PRUNE_NORMAL); // src+2 first unscanned byte |
| 4410 // may be off end of input |
| 4411 return true; |
| 4412 } |
| 4413 return false; |
| 4414 } |
| 4415 |
| 4416 void DumpSummary(DetectEncodingState* destatep, int whatset, int n) { |
| 4417 printf(" %sSummary[%2d]: ", kWhatSetName[whatset], |
| 4418 destatep->next_interesting_pair[whatset]); |
| 4419 int limit = minint(n, destatep->next_interesting_pair[whatset]); |
| 4420 for (int i = 0; i < limit; ++i) { |
| 4421 printf("%02x%02x ", |
| 4422 destatep->interesting_pairs[whatset][i * 2 + 0], |
| 4423 destatep->interesting_pairs[whatset][i * 2 + 1]); |
| 4424 if ((i & 7) == 7) {printf(" ");} |
| 4425 } |
| 4426 printf("\n"); |
| 4427 } |
| 4428 |
| 4429 void BeginDetail(DetectEncodingState* destatep) { |
| 4430 fprintf(stderr, "%d [", NUM_RANKEDENCODING); |
| 4431 for (int e = 0; e < NUM_RANKEDENCODING; ++e) { |
| 4432 fprintf(stderr, "(%s)", MyRankedEncName(e)); |
| 4433 if ((e % 10) == 9) {fprintf(stderr, "\n ");} |
| 4434 } |
| 4435 fprintf(stderr, "] size-detail\n"); |
| 4436 destatep->next_detail_entry = 0; |
| 4437 } |
| 4438 |
// Map the gap between two bigrams to one printable ASCII character:
//   0        -> ' '
//   <= 2     -> '='   (this includes negative deltas)
//   3..15    -> '_'
//   16..31   -> '+'
//   above 31 -> ' '
char DetailOffsetChar(int delta) {
  if (delta > 31) {return ' ';}
  if (delta > 15) {return '+';}
  if (delta > 2) {return '_';}
  return (delta == 0) ? ' ' : '=';
}
| 4447 |
| 4448 void DumpDetail(DetectEncodingState* destatep) { |
| 4449 // Turn all counts into delta from previous entry |
| 4450 fprintf(stderr, "%d count-detail\n", destatep->next_detail_entry); |
| 4451 // Rewrite, recording deltas |
| 4452 for (int z = destatep->next_detail_entry - 1; z > 0; --z) { |
| 4453 destatep->debug_data[z].offset -= destatep->debug_data[z - 1].offset; |
| 4454 for (int e = 0; e < NUM_RANKEDENCODING; ++e) { |
| 4455 destatep->debug_data[z].detail_enc_prob[e] -= |
| 4456 destatep->debug_data[z - 1].detail_enc_prob[e]; |
| 4457 } |
| 4458 } |
| 4459 // Now print |
| 4460 for (int z = 0; z < destatep->next_detail_entry; ++z) { |
| 4461 // Highlight some entries ending in '!' with light red underbar |
| 4462 int len = destatep->debug_data[z].label.size(); |
| 4463 if (destatep->debug_data[z].label[len - 1] == '!') { |
| 4464 fprintf(stderr, "1 0.9 0.9 do-flag\n"); |
| 4465 } |
| 4466 fprintf(stderr, "(%c%s) %d [", |
| 4467 DetailOffsetChar(destatep->debug_data[z].offset), |
| 4468 destatep->debug_data[z].label.c_str(), |
| 4469 destatep->debug_data[z].best_enc); |
| 4470 for (int e = 0; e < NUM_RANKEDENCODING; ++e) { |
| 4471 fprintf(stderr, "%d ", destatep->debug_data[z].detail_enc_prob[e]); |
| 4472 if ((e % 10) == 9) {fprintf(stderr, " ");} |
| 4473 } |
| 4474 fprintf(stderr, "] do-detail-e\n"); |
| 4475 } |
| 4476 // Get ready for next time,if any |
| 4477 destatep->next_detail_entry = 0; |
| 4478 } |
| 4479 |
// Write to stderr the marker that ends the current detail chart and starts
// a new one labelled with buff.
void PsRecurse(const char* buff) {
  FILE* const out = stderr;
  fprintf(out, "() end-detail (%s) start-detail\n\n", buff);
}
| 4483 |
| 4484 void DumpReliable(DetectEncodingState* destatep) { |
| 4485 printf("Not reliable: "); |
| 4486 |
| 4487 // Find center of gravity of OtherPair list |
| 4488 int x_sum = 0; |
| 4489 int y_sum = 0; |
| 4490 int count = destatep->next_interesting_pair[OtherPair]; |
| 4491 for (int i = 0; i < count; ++i) { |
| 4492 uint8 byte1 = destatep->interesting_pairs[OtherPair][i * 2 + 0]; |
| 4493 uint8 byte2 = destatep->interesting_pairs[OtherPair][i * 2 + 1]; |
| 4494 x_sum += byte2; |
| 4495 y_sum += byte1; |
| 4496 } |
| 4497 if (count == 0) {count = 1;} // adoid zdiv |
| 4498 int x_bar = x_sum / count; |
| 4499 int y_bar = y_sum / count; |
| 4500 printf("center %02X,%02X\n", x_bar, y_bar); |
| 4501 |
| 4502 double closest_dist = 999.0; |
| 4503 int closest = 0; |
| 4504 for (int j = 0; j < destatep->rankedencoding_list_len; j++) { |
| 4505 int rankedencoding = destatep->rankedencoding_list[j]; |
| 4506 const UnigramEntry* ue = &unigram_table[rankedencoding]; |
| 4507 printf(" %8s = %4d at %02x,%02x +/- %02X,%02X ", |
| 4508 MyEncodingName(kMapToEncoding[rankedencoding]), |
| 4509 destatep->enc_prob[rankedencoding], |
| 4510 ue->x_bar, ue->y_bar, |
| 4511 ue->x_stddev, ue->y_stddev); |
| 4512 double x_diff = x_bar - ue->x_bar; |
| 4513 double y_diff = y_bar - ue->y_bar; |
| 4514 double dist = sqrt((x_diff * x_diff) + (y_diff * y_diff)); |
| 4515 printf("(%3.1f)\n", dist); |
| 4516 |
| 4517 if (closest_dist > dist) { |
| 4518 closest_dist = dist; |
| 4519 closest = rankedencoding; |
| 4520 } |
| 4521 } |
| 4522 printf("Closest=%s (%3.1f)\n", |
| 4523 MyEncodingName(kMapToEncoding[closest]), closest_dist); |
| 4524 |
| 4525 for (int i = 0; i < 8; ++i) { |
| 4526 // Demote by distance to CG and see if that helps, or just quit |
| 4527 } |
| 4528 } |
| 4529 |
| 4530 // Scan short single lines quickly for all printable ASCII |
| 4531 // Return true if all bytes are in [20..7F], false otherwise |
| 4532 bool QuickPrintableAsciiScan(const char* text, int text_length) { |
| 4533 const uint8* src = reinterpret_cast<const uint8*>(text); |
| 4534 const uint8* srclimit = src + text_length; |
| 4535 const uint8* srclimit8 = srclimit - 7; |
| 4536 while (src < srclimit8) { |
| 4537 const uint32* s = reinterpret_cast<const uint32*>(src); |
| 4538 uint32 tmp1 = s[0]; |
| 4539 uint32 tmp2 = s[1]; |
| 4540 src += 8; |
| 4541 // Exits on any byte outside [0x20..0x7E] range (HT LF CR exit) |
| 4542 uint32 byte_outside_range_mask = ((tmp1 - 0x20202020U) | |
| 4543 (tmp1 + 0x01010101U) | |
| 4544 (tmp2 - 0x20202020U) | |
| 4545 (tmp2 + 0x01010101U)); |
| 4546 if ((byte_outside_range_mask & 0x80808080U) != 0) { |
| 4547 src -= 8; |
| 4548 break; |
| 4549 } |
| 4550 } |
| 4551 while (src < srclimit) { |
| 4552 uint8 uc = *src++; |
| 4553 if (kIsPrintableAscii[uc] == 0) {return false;} |
| 4554 } |
| 4555 return true; |
| 4556 } |
| 4557 |
| 4558 static const int kMaxScanBack = 192; |
| 4559 static const int kMaxScanForward = 64; |
| 4560 |
| 4561 // Return true if text is inside a tag or JS comment |
| 4562 bool TextInsideTag(const uint8* isrc, const uint8* src, const uint8* srclimit) { |
| 4563 const uint8* srcbacklimit = src - kMaxScanBack; |
| 4564 if (srcbacklimit < isrc) { |
| 4565 srcbacklimit = isrc; |
| 4566 } |
| 4567 const uint8* ss = src - 1; |
| 4568 while (srcbacklimit <= ss) { |
| 4569 uint8 c = *ss--; |
| 4570 if ((c & ~0x02) == '<') { |
| 4571 // We found preceding < 3C or > 3E nearby |
| 4572 // Even cheaper: if inside a tag, we don't care what tag; return true |
| 4573 if (c == '<') { |
| 4574 return true; |
| 4575 } |
| 4576 // See if we are just after <title>... |
| 4577 if ((c == '>') && (isrc <= (ss - 5)) && |
| 4578 (ss[-5] == '<') && |
| 4579 ((ss[-4] | 0x20) == 't') && |
| 4580 ((ss[-3] | 0x20) == 'i') && |
| 4581 ((ss[-2] | 0x20) == 't') && |
| 4582 ((ss[-1] | 0x20) == 'l') && |
| 4583 ((ss[-0] | 0x20) == 'e')) { |
| 4584 return true; |
| 4585 } |
| 4586 // See if we are just after <SCRIPT language=javascript>... |
| 4587 if ((c == '>') && (isrc <= (ss - 5)) && |
| 4588 (ss[-5] == 's') && |
| 4589 ((ss[-4] | 0x20) == 'c') && |
| 4590 ((ss[-3] | 0x20) == 'r') && |
| 4591 ((ss[-2] | 0x20) == 'i') && |
| 4592 ((ss[-1] | 0x20) == 'p') && |
| 4593 ((ss[-0] | 0x20) == 't')) { |
| 4594 return true; |
| 4595 } |
| 4596 // Not in a tag |
| 4597 return false; |
| 4598 // See if we are just after JavaScript comment /* ... |
| 4599 } else if (c == '/') { |
| 4600 if (((ss + 2) < srclimit) && (ss[2] == '*')) { |
| 4601 // We backscanned to /* |
| 4602 return true; |
| 4603 } |
| 4604 } |
| 4605 } |
| 4606 |
| 4607 return false; |
| 4608 } |
| 4609 |
| 4610 const uint8* SkipToTagEnd(const uint8* isrc, const uint8* src, const uint8* srcl
imit) { |
| 4611 const uint8* ss = src + 1; |
| 4612 while (ss <= srclimit) { |
| 4613 uint8 c = *ss++; |
| 4614 if ((c == '<') || (c == '>')) { |
| 4615 return ss; |
| 4616 } |
| 4617 } |
| 4618 return src + 2; // Always make progress, Otherwise we get an infinite loop |
| 4619 } |
| 4620 |
| 4621 |
| 4622 // Take a watch string and map to a ranked encoding. If no match, return -1 |
| 4623 int LookupWatchEnc(const string& watch_str) { |
| 4624 int watchval = -1; |
| 4625 // Mixed encoding maps to enc=UTF8UTF8 |
| 4626 if (watch_str == "UTF8UTF8") { |
| 4627 watchval = F_UTF8UTF8; |
| 4628 } else { |
| 4629 Encoding enc; |
| 4630 if (EncodingFromName(watch_str.c_str(), &enc)) { |
| 4631 watchval = CompactEncDet::BackmapEncodingToRankedEncoding(enc); |
| 4632 } |
| 4633 } |
| 4634 return watchval; |
| 4635 } |
| 4636 |
| 4637 // Return true if enc and enc2 are equal or one is a subset of the other |
| 4638 // or either is UNKNOWN |
| 4639 // also UTF8UTF8 is compatible with both Latin1 and UTF8 |
| 4640 bool CompatibleEnc(Encoding enc, Encoding enc2) { |
| 4641 if (enc < 0) {return false;} |
| 4642 if (NUM_ENCODINGS <= enc) {return false;} |
| 4643 if (enc2 < 0) {return false;} |
| 4644 if (NUM_ENCODINGS <= enc2) {return false;} |
| 4645 if (enc == enc2) {return true;} |
| 4646 if (kMapEncToBaseEncoding[enc] == kMapEncToBaseEncoding[enc2]) {return true;} |
| 4647 |
| 4648 if (enc == ASCII_7BIT) {return true;} |
| 4649 if (enc2 == ASCII_7BIT) {return true;} |
| 4650 if (enc == UNKNOWN_ENCODING) {return true;} |
| 4651 if (enc2 == UNKNOWN_ENCODING) {return true;} |
| 4652 if (enc == UTF8UTF8) { |
| 4653 if (enc2 == UTF8) {return true;} |
| 4654 if (kMapEncToBaseEncoding[enc2] == ISO_8859_1) {return true;} |
| 4655 } |
| 4656 if (enc2 == UTF8UTF8) { |
| 4657 if (enc == UTF8) {return true;} |
| 4658 if (kMapEncToBaseEncoding[enc] == ISO_8859_1) {return true;} |
| 4659 } |
| 4660 |
| 4661 return false; |
| 4662 } |
| 4663 |
| 4664 // Return superset of enc and enc2, which must be compatible |
| 4665 Encoding SupersetEnc(Encoding enc, Encoding enc2) { |
| 4666 //printf(" SupersetEnc (%s, ", MyEncodingName(enc)); // TEMP |
| 4667 //printf("%s) ", MyEncodingName(enc2)); |
| 4668 //printf("= %s\n", |
| 4669 // MyEncodingName(kMapEncToSuperLevel[enc] >= kMapEncToSuperLevel[enc2]
? |
| 4670 // enc :enc2)); |
| 4671 if (kMapEncToSuperLevel[enc] >= kMapEncToSuperLevel[enc2]) { |
| 4672 return enc; |
| 4673 } |
| 4674 return enc2; |
| 4675 } |
| 4676 |
| 4677 |
| 4678 // If unreliable, try rescoring to separate some encodings |
| 4679 Encoding Rescore(Encoding enc, const uint8* isrc, |
| 4680 const uint8* srctextlimit, DetectEncodingState* destatep) { |
| 4681 if (FLAGS_counts) {++rescore_used;} |
| 4682 Encoding new_enc = enc; |
| 4683 |
| 4684 bool rescore_change = false; |
| 4685 |
| 4686 int count = destatep->next_interesting_pair[OtherPair]; |
| 4687 int text_length = srctextlimit - isrc; |
| 4688 for (int i = 0; i < count; ++i) { |
| 4689 int bigram_offset = destatep->interesting_offsets[OtherPair][i]; |
| 4690 uint8 byte0 = (0 < bigram_offset) ? |
| 4691 isrc[bigram_offset - 1] : 0x20; |
| 4692 uint8 byte1 = isrc[bigram_offset + 0]; // Known to have high bit on |
| 4693 uint8 byte2 = ((bigram_offset + 1) < text_length) ? |
| 4694 isrc[bigram_offset + 1] : 0x20; |
| 4695 uint8 byte3 = ((bigram_offset + 2) < text_length) ? |
| 4696 isrc[bigram_offset + 2] : 0x20; |
| 4697 int high_hash = ((byte0 & 0xc0) >> 0) | |
| 4698 ((byte1 & 0xc0) >> 1) | |
| 4699 ((byte2 & 0xc0) >> 4) | |
| 4700 ((byte3 & 0xc0) >> 6); // 00112233 |
| 4701 |
| 4702 // Boost HighAccent encodings for Ascii bit patterns |
| 4703 // 0x1x 0x0x |
| 4704 // 1010 1010 |
| 4705 // 0010 0000 |
| 4706 // |
| 4707 if ((high_hash & 0xaa) == 0x20) { |
| 4708 for (int j = 0; j < destatep->rankedencoding_list_len; j++) { |
| 4709 int rankedencoding = destatep->rankedencoding_list[j]; |
| 4710 if (HighAccentEncoding(kMapToEncoding[rankedencoding])) { |
| 4711 // TODO: also want to boost Shift-JIS here if byte1 is Ax..Dx |
| 4712 // TEMP |
| 4713 //printf(" Rescore[%02x] %s +%d\n", |
| 4714 // high_hash, MyRankedEncName(rankedencoding), kGentlePairBoost)
; |
| 4715 Boost(destatep, rankedencoding, kGentlePairBoost); |
| 4716 rescore_change = true; |
| 4717 } |
| 4718 } |
| 4719 } |
| 4720 |
| 4721 // Whack HighAccent encodings for high bit patterns |
| 4722 // 1x1x 1x1x |
| 4723 // 1010 1010 |
| 4724 // 1010 1010 |
| 4725 // |
| 4726 if ((high_hash & 0xaa) == 0xaa) { |
| 4727 for (int j = 0; j < destatep->rankedencoding_list_len; j++) { |
| 4728 int rankedencoding = destatep->rankedencoding_list[j]; |
| 4729 if (HighAccentEncoding(kMapToEncoding[rankedencoding])) { |
| 4730 // TEMP |
| 4731 //printf(" Rescore[%02x] %s -%d\n", |
| 4732 // high_hash, MyRankedEncName(rankedencoding), kGentlePairBoost)
; |
| 4733 Whack(destatep, rankedencoding, kGentlePairBoost); |
| 4734 rescore_change = true; |
| 4735 } |
| 4736 } |
| 4737 } |
| 4738 |
| 4739 } |
| 4740 |
| 4741 if (rescore_change) { |
| 4742 ReRank(destatep); |
| 4743 new_enc = kMapToEncoding[destatep->top_rankedencoding]; |
| 4744 |
| 4745 if (destatep->debug_data != NULL) { |
| 4746 char buff[32]; |
| 4747 snprintf(buff, sizeof(buff), "=Rescore %s", MyEncodingName(new_enc)); |
| 4748 SetDetailsEncProb(destatep, |
| 4749 0, |
| 4750 CompactEncDet::BackmapEncodingToRankedEncoding(new_enc), |
| 4751 buff); |
| 4752 //// DumpDetail(destatep); |
| 4753 } |
| 4754 |
| 4755 SimplePrune(destatep, kFinalPruneDifference); |
| 4756 CalcReliable(destatep); |
| 4757 } |
| 4758 |
| 4759 //if (new_enc != enc) { |
| 4760 // // TEMP |
| 4761 // printf(" Rescore new top encoding = %s\n", |
| 4762 // MyRankedEncName(destatep->top_rankedencoding)); |
| 4763 //} |
| 4764 |
| 4765 return new_enc; |
| 4766 } |
| 4767 |
| 4768 |
| 4769 // Given an encoding, add its corresponding ranked encoding to the set |
| 4770 void AddToSet(Encoding enc, int* list_len, int* list) { |
| 4771 // TEMP print |
| 4772 int item = CompactEncDet::BackmapEncodingToRankedEncoding(enc); |
| 4773 for (int i = 0; i < *list_len; ++i) { |
| 4774 if (list[i] == item) { |
| 4775 return; // Already in the set; don't add again |
| 4776 } |
| 4777 } |
| 4778 list[(*list_len)++] = item; |
| 4779 } |
| 4780 |
| 4781 |
| 4782 static const int kMinRobustBigramCount = 1000; |
| 4783 static const int kMinKBToRobustScan = 64; |
| 4784 static const int kMaxKBToRobustScan = 256; |
| 4785 |
| 4786 // Scan the first 64K or so, just doing raw bigram increments on given |
| 4787 // probability list. |
| 4788 // No fancy duplicate filtering or anything else here. |
| 4789 // Returns number of bigrams counted |
| 4790 int RobustScan(const char* text, |
| 4791 int text_length, |
| 4792 int robust_renc_list_len, |
| 4793 int* robust_renc_list, |
| 4794 int* robust_renc_probs) { |
| 4795 if (FLAGS_counts) {++robust_used;} |
| 4796 // Zero all the result probabilities |
| 4797 for (int i = 0; i < robust_renc_list_len; ++i) { |
| 4798 robust_renc_probs[i] = 0; |
| 4799 } |
| 4800 int max_fast_len = minint(text_length, (kMaxKBToRobustScan << 10)); |
| 4801 const uint8* isrc = reinterpret_cast<const uint8*>(text); |
| 4802 const uint8* src = isrc; |
| 4803 const uint8* srclimitfast2 = isrc + max_fast_len - 1; |
| 4804 const uint8* srclimitfast4 = isrc + max_fast_len - 3; |
| 4805 |
| 4806 int min_fast_len = minint(text_length, (kMinKBToRobustScan << 10)); |
| 4807 const uint8* srclimitmin = isrc + min_fast_len - 1; |
| 4808 |
| 4809 int bigram_count = 0; |
| 4810 |
| 4811 if (FLAGS_enc_detect_source) { |
| 4812 PsSourceInit(kPsSourceWidth); |
| 4813 fprintf(stderr, "(RobustScan) do-src\n"); |
| 4814 } |
| 4815 |
| 4816 // Sum over a big chunk of the input |
| 4817 // Faster loop, no 7-bit-encodings possible, approx 3000 GB/sec |
| 4818 //==================================== |
| 4819 while (src < srclimitfast2) { |
| 4820 // Skip to next interesting bigram |
| 4821 while (src < srclimitfast4) { |
| 4822 uint32 u32 = *reinterpret_cast<const uint32*>(src); |
| 4823 src+= 4; |
| 4824 if ((u32 & 0x80808080) != 0) {src -= 4; break;} |
| 4825 } |
| 4826 while (src < srclimitfast2) { |
| 4827 uint8 uc = *src++; |
| 4828 if (static_cast<signed char>(uc) < 0) {src--; break;} |
| 4829 } |
| 4830 |
| 4831 if (src < srclimitfast2) { |
| 4832 // We found a bigram with high bit on |
| 4833 // Next 5 lines commented out so we don't show all the source. |
| 4834 //const uint8* srctextlimit = isrc + text_length; |
| 4835 //if (FLAGS_enc_detect_source) { |
| 4836 // PsSource(src, isrc, srctextlimit); |
| 4837 // PsMark(src, 2, isrc, 0); |
| 4838 //} |
| 4839 |
| 4840 uint8 byte1 = src[0]; |
| 4841 uint8 byte2 = src[1]; |
| 4842 uint8 byte1x2x = (byte1 & 0xf0) | ((byte2 >> 4) & 0x0f); |
| 4843 uint8 byte1f = byte1; |
| 4844 // Flip top bit of subscript to better separate quadrant 4 (esp. for Hebre
w) |
| 4845 byte1f ^= (byte2 & 0x80); |
| 4846 |
| 4847 // The real increments |
| 4848 for (int j = 0; j < robust_renc_list_len; ++j) { |
| 4849 int rankedencoding = robust_renc_list[j]; |
| 4850 const UnigramEntry* ue = &unigram_table[rankedencoding]; |
| 4851 int incr = ue->b1[byte1f] + ue->b2[byte2] + ue->b12[byte1x2x]; |
| 4852 if ((ue->b12[byte1x2x] & 0x01) != 0) { |
| 4853 // Use a more-precise table |
| 4854 int byte32x32 = ((byte1 & 0x1f) << 5) | (byte2 & 0x1f); |
| 4855 int hiressub = (byte2 & 0x60) >> 5; // select w/bits 5&6 of byte 2 |
| 4856 DCHECK(ue->hires[hiressub] != NULL); |
| 4857 incr += ue->hires[hiressub][byte32x32]; |
| 4858 } else { |
| 4859 // Default final offset |
| 4860 incr += ue->so; |
| 4861 } |
| 4862 robust_renc_probs[j] += incr; |
| 4863 } |
| 4864 |
| 4865 src += 2; // Continue after this bigram |
| 4866 ++bigram_count; |
| 4867 |
| 4868 // Stop after 1000 bigrams reached, if at least 64KB scanned |
| 4869 if ((bigram_count > kMinRobustBigramCount) && (src > srclimitmin)) { |
| 4870 break; |
| 4871 } |
| 4872 } |
| 4873 } |
| 4874 |
| 4875 if (FLAGS_enc_detect_source) { |
| 4876 fprintf(stderr, "( bigram_count = %d) do-src\n", bigram_count); |
| 4877 if (bigram_count == 0) {bigram_count = 1;} // zdiv |
| 4878 for (int i = 0; i < robust_renc_list_len; ++i) { |
| 4879 fprintf(stderr, "( enc[%-12.12s] = %7d (avg %d)) do-src\n", |
| 4880 MyRankedEncName(robust_renc_list[i]), robust_renc_probs[i], |
| 4881 robust_renc_probs[i] / bigram_count); |
| 4882 } |
| 4883 PsSourceFinish(); |
| 4884 } |
| 4885 |
| 4886 return bigram_count; |
| 4887 } |
| 4888 |
| 4889 // If unreliable, rescan middle of document to see if we can get a better |
| 4890 // answer. Rescan is only worthwhile if there are ~200 bytes or more left, |
| 4891 // since the detector takes as much as 96 bytes of bigrams to decide. |
| 4892 Encoding Rescan(Encoding enc, |
| 4893 const uint8* isrc, |
| 4894 const uint8* src, |
| 4895 const uint8* srctextlimit, |
| 4896 const char* url_hint, |
| 4897 const char* http_charset_hint, |
| 4898 const char* meta_charset_hint, |
| 4899 const int encoding_hint, |
| 4900 const Language language_hint, |
| 4901 const CompactEncDet::TextCorpusType corpus_type, |
| 4902 bool ignore_7bit_mail_encodings, |
| 4903 DetectEncodingState* destatep) { |
| 4904 bool enc_is_reliable = destatep->reliable; |
| 4905 Encoding new_enc = enc; |
| 4906 Encoding second_best_enc = |
| 4907 kMapToEncoding[destatep->second_top_rankedencoding]; |
| 4908 |
| 4909 if (FLAGS_counts) {++rescan_used;} |
| 4910 |
| 4911 int scanned_bytes = src - isrc; |
| 4912 int unscanned_bytes = srctextlimit - src; |
| 4913 int text_length = srctextlimit - isrc; |
| 4914 bool empty_rescan = true; |
| 4915 |
| 4916 // See if enough bytes left to bother doing rescan |
| 4917 if (kMinRescanLength < unscanned_bytes) { |
| 4918 const char* text = reinterpret_cast<const char*>(isrc); |
| 4919 |
| 4920 Encoding one_hint = destatep->http_hint; |
| 4921 if ((one_hint == UNKNOWN_ENCODING) && |
| 4922 (destatep->meta_hint != UNKNOWN_ENCODING)) { |
| 4923 one_hint = destatep->meta_hint; |
| 4924 } |
| 4925 if ((one_hint == UNKNOWN_ENCODING) && |
| 4926 (destatep->bom_hint != UNKNOWN_ENCODING)) { |
| 4927 one_hint = destatep->bom_hint; |
| 4928 } |
| 4929 |
| 4930 // Go to an even offset to keep UTF-16 in synch |
| 4931 int middle_offset = (scanned_bytes + (unscanned_bytes / 2)) & ~1; |
| 4932 CHECK(middle_offset <= text_length); |
| 4933 |
| 4934 // Look back a bit for a low byte to synchronize, else hope for the best. |
| 4935 const uint8* srcbacklimit = isrc + middle_offset - kMaxScanBack; |
| 4936 if (srcbacklimit < src) { |
| 4937 srcbacklimit = src; |
| 4938 } |
| 4939 const uint8* ss = isrc + middle_offset - 1; |
| 4940 while (srcbacklimit <= ss) { |
| 4941 if ((*ss & 0x80) == 0) {break;} |
| 4942 --ss; |
| 4943 } |
| 4944 // Leave middle offset unchanged unless we found a low byte |
| 4945 if (srcbacklimit <= ss) { |
| 4946 // Align to low byte or high byte just after it, whichever is even |
| 4947 middle_offset = (ss - isrc + 1) & ~1; // Even to keep UTF-16 in sync |
| 4948 } |
| 4949 CHECK(middle_offset <= text_length); |
| 4950 |
| 4951 if (destatep->debug_data != NULL) { |
| 4952 SetDetailsEncLabel(destatep, ">> Rescan"); |
| 4953 // Print the current chart before recursive call |
| 4954 DumpDetail(destatep); |
| 4955 |
| 4956 char buff[32]; |
| 4957 snprintf(buff, sizeof(buff), ">> Rescan[%d..%d]", |
| 4958 middle_offset, text_length); |
| 4959 PsRecurse(buff); |
| 4960 } |
| 4961 |
| 4962 int mid_bytes_consumed; |
| 4963 bool mid_is_reliable; |
| 4964 Encoding mid_second_best_enc; |
| 4965 CEDInternalFlags newflags = static_cast<CEDInternalFlags>( |
| 4966 kCEDRescanning + kCEDForceTags); |
| 4967 // Recursive call for rescan of half of remaining |
| 4968 Encoding mid_enc = InternalDetectEncoding( |
| 4969 newflags, |
| 4970 text + middle_offset, |
| 4971 text_length - middle_offset, |
| 4972 url_hint, |
| 4973 http_charset_hint, |
| 4974 meta_charset_hint, |
| 4975 encoding_hint, |
| 4976 language_hint, // User interface lang |
| 4977 corpus_type, |
| 4978 ignore_7bit_mail_encodings, |
| 4979 &mid_bytes_consumed, |
| 4980 &mid_is_reliable, |
| 4981 &mid_second_best_enc); |
| 4982 destatep->reliable = mid_is_reliable; |
| 4983 |
| 4984 empty_rescan = (mid_enc == ASCII_7BIT); |
| 4985 |
| 4986 // Not the right decision if, e.g. enc=Greek, mid=ASCII7, one=KSC |
| 4987 // hence the !empty_rescan term |
| 4988 if (!empty_rescan && CompatibleEnc(one_hint, mid_enc)) { |
| 4989 // Encoding we just found is compatible with the |
| 4990 // single hint (if any); return superset |
| 4991 new_enc = SupersetEnc(one_hint, mid_enc); |
| 4992 } |
| 4993 |
| 4994 // If original and mid are compatible, and both reliable, |
| 4995 // return new_enc = SupersetEnc(enc, mid_enc) |
| 4996 // |
| 4997 // This avoids too much weight on a bogus hint causing a RobustScan |
| 4998 // that gets the wrong answer |
| 4999 if (!empty_rescan && mid_is_reliable && enc_is_reliable && |
| 5000 CompatibleEnc(enc, mid_enc)) { |
| 5001 new_enc = SupersetEnc(enc, mid_enc); |
| 5002 return new_enc; |
| 5003 } |
| 5004 |
| 5005 // if mid unreliable, robustscan |
| 5006 // if mid empty, robustscan |
| 5007 // if original and mid not compatible, robustscan |
| 5008 // if mid and one_hint not compatible, robustscan |
| 5009 |
| 5010 // If we found conflicting data, drop back and do a robust scan of a big |
| 5011 // chunk of the input over a set of candidate encodings |
| 5012 // |
| 5013 if (!mid_is_reliable || |
| 5014 empty_rescan || |
| 5015 !CompatibleEnc(enc, mid_enc) || |
| 5016 !CompatibleEnc(one_hint, mid_enc)) { |
| 5017 int robust_renc_list_len; // Number of active encodings |
| 5018 int robust_renc_list[NUM_RANKEDENCODING]; // List of ranked encodings |
| 5019 int robust_renc_probs[NUM_RANKEDENCODING]; // List of matching probs |
| 5020 |
| 5021 robust_renc_list_len = 0; |
| 5022 AddToSet(enc, &robust_renc_list_len, robust_renc_list); |
| 5023 AddToSet(second_best_enc, &robust_renc_list_len, robust_renc_list); |
| 5024 AddToSet(mid_enc, &robust_renc_list_len, robust_renc_list); |
| 5025 AddToSet(mid_second_best_enc, &robust_renc_list_len, robust_renc_list); |
| 5026 if (destatep->http_hint != UNKNOWN_ENCODING) { |
| 5027 AddToSet(destatep->http_hint, &robust_renc_list_len, robust_renc_list); |
| 5028 } |
| 5029 if (destatep->meta_hint != UNKNOWN_ENCODING) { |
| 5030 AddToSet(destatep->meta_hint, &robust_renc_list_len, robust_renc_list); |
| 5031 } |
| 5032 if (destatep->bom_hint != UNKNOWN_ENCODING) { |
| 5033 AddToSet(destatep->bom_hint, &robust_renc_list_len, robust_renc_list); |
| 5034 } |
| 5035 if (destatep->tld_hint != UNKNOWN_ENCODING) { |
| 5036 AddToSet(destatep->tld_hint, &robust_renc_list_len, robust_renc_list); |
| 5037 } |
| 5038 |
| 5039 // Separate simple scan |
| 5040 // ===================== |
| 5041 if (destatep->debug_data != NULL) { |
| 5042 SetDetailsEncLabel(destatep, ">> RobustScan"); |
| 5043 // Print the current chart before recursive call |
| 5044 DumpDetail(destatep); |
| 5045 |
| 5046 char buff[32]; |
| 5047 snprintf(buff, sizeof(buff), ">> RobustScan[0..%d]", text_length); |
| 5048 PsRecurse(buff); |
| 5049 } |
| 5050 |
| 5051 int bigram_count = RobustScan(text, text_length, |
| 5052 robust_renc_list_len, robust_renc_list, robust_renc_probs); |
| 5053 |
| 5054 // Default to new_enc and update if something better was found |
| 5055 int best_prob = -1; |
| 5056 // TEMP print |
| 5057 for (int i = 0; i < robust_renc_list_len; ++i) { |
| 5058 if (best_prob < robust_renc_probs[i]) { |
| 5059 best_prob = robust_renc_probs[i]; |
| 5060 new_enc = kMapToEncoding[robust_renc_list[i]]; |
| 5061 } |
| 5062 } |
| 5063 |
| 5064 if (destatep->debug_data != NULL) { |
| 5065 char buff[32]; |
| 5066 snprintf(buff, sizeof(buff), "=Robust[%d] %s", |
| 5067 bigram_count, MyEncodingName(new_enc)); |
| 5068 SetDetailsEncProb(destatep, |
| 5069 0, |
| 5070 CompactEncDet::BackmapEncodingToRankedEncoding(new_enc
), |
| 5071 buff); |
| 5072 } |
| 5073 } |
| 5074 } // End if enough bytes |
| 5075 |
| 5076 return new_enc; |
| 5077 } |
| 5078 |
| 5079 // With no hints at all, and perhaps on rescan, we relax our pickiness |
| 5080 // and go ahead and accept the top multibyte encodings, even though |
| 5081 // strictly their web pages should have declared an explicit encoding to |
| 5082 // avoid the HTML standard's default ISO-8859-1. |
| 5083 bool NoHintsCloseEnoughCompatible(Encoding top_enc) { |
| 5084 // First test accepts degenerate cases plus UTF8 and UTF8UTF8 |
| 5085 if (CompatibleEnc(UTF8, top_enc)) {return true;} |
| 5086 |
| 5087 // The rest look for exact match of base encoding |
| 5088 Encoding base_enc = kMapEncToBaseEncoding[top_enc]; |
| 5089 if (base_enc == JAPANESE_EUC_JP) {return true;} |
| 5090 if (base_enc == JAPANESE_SHIFT_JIS) {return true;} |
| 5091 if (base_enc == CHINESE_BIG5) {return true;} |
| 5092 if (base_enc == CHINESE_GB) {return true;} |
| 5093 if (base_enc == KOREAN_EUC_KR) {return true;} |
| 5094 return false; |
| 5095 } |
| 5096 |
| 5097 |
| 5098 |
| 5099 // Scan raw bytes and detect most likely encoding |
| 5100 // Design goals: |
| 5101 // Skip over big initial stretches of seven-bit ASCII bytes very quickly |
| 5102 // Thread safe |
| 5103 // Works equally well on |
| 5104 // 50-byte queries, |
| 5105 // 5000-byte email and |
| 5106 // 50000-byte web pages |
| 5107 // Length 0 input returns ISO_8859_1 (ASCII) encoding |
| 5108 // Setting ignore_7bit_mail_encodings effectively turns off detection of |
| 5109 // UTF-7, HZ, and ISO-2022-xx |
| 5110 Encoding InternalDetectEncoding( |
| 5111 CEDInternalFlags flags, const char* text, int text_length, |
| 5112 const char* url_hint, const char* http_charset_hint, |
| 5113 const char* meta_charset_hint, const int encoding_hint, |
| 5114 const Language language_hint, // User interface lang |
| 5115 const CompactEncDet::TextCorpusType corpus_type, |
| 5116 bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable, |
| 5117 Encoding* second_best_enc) { |
| 5118 *bytes_consumed = 0; |
| 5119 *is_reliable = false; |
| 5120 *second_best_enc = ASCII_7BIT; |
| 5121 |
| 5122 if (text_length == 0) { |
| 5123 // Follow the spec. Text might be NULL. |
| 5124 *is_reliable = true; |
| 5125 return ISO_8859_1; |
| 5126 } |
| 5127 |
| 5128 // For very short (20-50 byte) input strings that are highly likely to be |
| 5129 // all printable ASCII, our startup overhead might dominate. We have to do the |
| 5130 // full detection if the ISO-2022-xx, HZ, or UTF-7 encodings are possible. |
| 5131 // Otherwise, we can do a quick scan for printable ASCII. |
| 5132 if ((text_length <= 500) && ignore_7bit_mail_encodings && |
| 5133 QuickPrintableAsciiScan(text, text_length)) { |
| 5134 *is_reliable = true; |
| 5135 return ASCII_7BIT; |
| 5136 } |
| 5137 |
| 5138 // Go for the full boat detection |
| 5139 DetectEncodingState destate; |
| 5140 InitDetectEncodingState(&destate); |
| 5141 |
| 5142 std::unique_ptr<DetailEntry[]> scoped_debug_data; |
| 5143 if (FLAGS_enc_detect_detail) { |
| 5144 // Allocate max 10 details per bigram |
| 5145 scoped_debug_data.reset(new DetailEntry[kMaxPairs * 10]); |
| 5146 destate.debug_data = scoped_debug_data.get(); |
| 5147 // NOTE: destate and scoped_debug_data have exactly the same scope |
| 5148 // All other FLAGS_enc_detect_detail tests use destate.debug_data != NULL |
| 5149 } |
| 5150 |
| 5151 // Get text length limits |
| 5152 // Typically, we scan the first 16KB looking for all encodings, then |
| 5153 // scan the rest (up to 256KB) a bit faster by no longer looking for |
| 5154 // interesting bytes below 0x80. This allows us to skip over runs of |
| 5155 // 7-bit-ASCII much more quickly. |
| 5156 int slow_len = minint(text_length, (FLAGS_enc_detect_slow_max_kb << 10)); |
| 5157 int fast_len = minint(text_length, (FLAGS_enc_detect_fast_max_kb << 10)); |
| 5158 |
| 5159 // Initialize pointers. |
| 5160 // In general, we do not look at last 3 bytes of input in the fast scan |
| 5161 // We do, however want to look at the last byte or so in the slow scan, |
| 5162 // especilly in the case of a very short text whose only interesting |
| 5163 // information is a 3-byte UTF-8 character in the last three bytes. |
| 5164 // If necessary, we fake a last bigram with 0x20 space as a pad byte. |
| 5165 const uint8* isrc = reinterpret_cast<const uint8*>(text); |
| 5166 const uint8* src = isrc; |
| 5167 const uint8* srctextlimit = isrc + text_length; |
| 5168 const uint8* srclimitslow2 = isrc + slow_len - 1; |
| 5169 const uint8* srclimitfast2 = isrc + fast_len - 1; |
| 5170 const uint8* srclimitfast4 = isrc + fast_len - 3; |
| 5171 if (srclimitslow2 > srclimitfast2) { |
| 5172 srclimitslow2 = srclimitfast2; |
| 5173 } |
| 5174 destate.initial_src = isrc; |
| 5175 destate.limit_src = srclimitfast2 + 1; // May include last byte |
| 5176 destate.prior_src = isrc; |
| 5177 destate.last_pair = isrc - 2; |
| 5178 |
| 5179 const char* scan_table = kTestPrintableAsciiTildePlus; |
| 5180 if (ignore_7bit_mail_encodings) { |
| 5181 // Caller wants to ignore UTF-7, HZ, ISO-2022-xx |
| 5182 // Don't stop on + (for UTF-7), nor on ~ (for HZ) |
| 5183 scan_table = kTestPrintableAscii; |
| 5184 } |
| 5185 int exit_reason = 0; |
| 5186 |
| 5187 if (destate.debug_data != NULL) { |
| 5188 BeginDetail(&destate); |
| 5189 // Take any incoming watch encoding name and backmap to the corresponding |
| 5190 // ranked enum value |
| 5191 watch1_rankedenc = LookupWatchEnc(FLAGS_enc_detect_watch1); |
| 5192 if (watch1_rankedenc >= 0) { |
| 5193 fprintf(stderr, "/track-me %d def\n", watch1_rankedenc); |
| 5194 } |
| 5195 |
| 5196 watch2_rankedenc = LookupWatchEnc(FLAGS_enc_detect_watch2); |
| 5197 if (watch2_rankedenc >= 0) { |
| 5198 fprintf(stderr, "/track-me2 %d def\n", watch2_rankedenc); |
| 5199 } |
| 5200 |
| 5201 fprintf(stderr, "%% kDerateHintsBelow = %d\n", kDerateHintsBelow); |
| 5202 } |
| 5203 if (FLAGS_enc_detect_source) { |
| 5204 PsSourceInit(kPsSourceWidth); |
| 5205 PsSource(src, isrc, srctextlimit); |
| 5206 PsMark(src, 4, isrc, 0); |
| 5207 } |
| 5208 |
| 5209 // Apply hints, if any, to probabilities |
| 5210 // NOTE: Encoding probabilites are all zero at this point |
| 5211 ApplyHints(url_hint, |
| 5212 http_charset_hint, |
| 5213 meta_charset_hint, |
| 5214 encoding_hint, |
| 5215 language_hint, |
| 5216 corpus_type, |
| 5217 &destate); |
| 5218 |
| 5219 // NOTE: probabilities up to this point are subject to derating for |
| 5220 // small numbers of bigrams. |
| 5221 // Probability changes after this point are not derated. |
| 5222 |
| 5223 // Do first 4 bytes to pick off strong markers |
| 5224 InitialBytesBoost(isrc, text_length, &destate); |
| 5225 |
| 5226 bool ignored_some_tag_text = false; |
| 5227 int tag_text_bigram_count = 0; |
| 5228 |
| 5229 // Slower loop, approx 500 MB/sec (2.8 GHz P4) |
| 5230 // ASSERT(srclimitslow2 <= srclimitfast2); |
| 5231 //==================================== |
| 5232 DoMoreSlowLoop: |
| 5233 while (src < srclimitslow2) { |
| 5234 // Skip to next interesting byte (this is the slower part) |
| 5235 while (src < srclimitslow2) { |
| 5236 uint8 uc = *src++; |
| 5237 if (scan_table[uc] != 0) {exit_reason = scan_table[uc]; src--; break;} |
| 5238 } |
| 5239 |
| 5240 if (src < srclimitslow2) { |
| 5241 if (FLAGS_enc_detect_source) { |
| 5242 PsSource(src, isrc, srctextlimit); // don't mark yet |
| 5243 } |
| 5244 |
| 5245 int weightshift = 0; |
| 5246 // In the first 16KB, derate new text run inside <title>...</title> and |
| 5247 // inside <!-- ... --> |
| 5248 if (////((destate.last_pair + 6) <= src) && // if beyond last
one |
| 5249 ////(tag_text_bigram_count < kMaxBigramsTagTitleText) && |
| 5250 (corpus_type == CompactEncDet::WEB_CORPUS) && // and web page |
| 5251 !CEDFlagForceTags(flags)) { // and OK to skip |
| 5252 ////if (TextInsideTag(destate.last_pair + 2, src, srclimitslow2)) { |
| 5253 if (TextInsideTag(isrc, src, srclimitslow2)) { |
| 5254 if (tag_text_bigram_count >= kMaxBigramsTagTitleText) { |
| 5255 ignored_some_tag_text = true; |
| 5256 src = SkipToTagEnd(destate.last_pair + 2, src, srclimitslow2); |
| 5257 continue; |
| 5258 } else { |
| 5259 weightshift = kWeightshiftForTagTitleText; |
| 5260 ++tag_text_bigram_count; |
| 5261 } |
| 5262 } |
| 5263 } |
| 5264 if (FLAGS_enc_detect_source) { |
| 5265 PsMark(src, 2, isrc, weightshift); |
| 5266 } |
| 5267 // Saves byte pair and offset |
| 5268 bool pruned = IncrementAndBoostPrune(src, srctextlimit - src, |
| 5269 &destate, weightshift, exit_reason); |
| 5270 // Advance; if inside tag, advance to end of tag |
| 5271 if (weightshift == 0) { |
| 5272 src += exit_reason; // 1 Ascii, 2 other |
| 5273 } else { |
| 5274 src += exit_reason; // 1 Ascii, 2 other |
| 5275 //// src = SkipToTagEnd(destate.last_pair, src, srclimitslow2); |
| 5276 } |
| 5277 |
| 5278 if (pruned) { |
| 5279 // Scoring and active encodings have been updated |
| 5280 if (destate.done) {break;} |
| 5281 // Check if all the reasons for the slow loop have been pruned |
| 5282 // If so, go to fast loop |
| 5283 if (!SevenBitActive(&destate)) {break;} |
| 5284 } |
| 5285 } |
| 5286 } |
| 5287 //==================================== |
| 5288 |
| 5289 // We reached the end of a slow scan, possibly because no more SevenBitActive, |
| 5290 // or possibly are at end of source. |
| 5291 // If we are exactly at the end of the source, make sure we look at the very |
| 5292 // last byte. |
| 5293 bool very_last_byte_incremented = false; |
| 5294 if (src == (srctextlimit - 1)) { |
| 5295 exit_reason = scan_table[*src]; |
| 5296 if (exit_reason != 0) { |
| 5297 // The very last byte is an interesting byte |
| 5298 // Saves byte pair and offset |
| 5299 //printf("Interesting very last slow byte = 0x%02x\n", *src); |
| 5300 IncrementAndBoostPrune(src, srctextlimit - src, &destate, 0, exit_reason); |
| 5301 very_last_byte_incremented = true; |
| 5302 } |
| 5303 } |
| 5304 |
| 5305 if (FLAGS_enc_detect_source) { |
| 5306 PsSource(src, isrc, srctextlimit); |
| 5307 PsMark(src, 2, isrc, 0); |
| 5308 } |
| 5309 // Force a pruning based on whatever we have |
| 5310 // Delete the seven-bit encodings if there is no evidence of them so far |
| 5311 BoostPrune(src, &destate, PRUNE_SLOWEND); |
| 5312 |
| 5313 if (!destate.done) { |
| 5314 // If not clear yet on 7-bit-encodings and more bytes, do more slow |
| 5315 if (SevenBitActive(&destate) && (src < srclimitfast2)) { |
| 5316 // Increment limit by another xxxK |
| 5317 slow_len += (FLAGS_enc_detect_slow_max_kb << 10); |
| 5318 srclimitslow2 = isrc + slow_len - 1; |
| 5319 if (srclimitslow2 > srclimitfast2) { |
| 5320 srclimitslow2 = srclimitfast2; |
| 5321 } |
| 5322 if (!UTF7OrHzActive(&destate)) { |
| 5323 // We can switch to table that does not stop on + ~ |
| 5324 scan_table = kTestPrintableAscii; |
| 5325 } |
| 5326 goto DoMoreSlowLoop; |
| 5327 } |
| 5328 |
| 5329 |
| 5330 exit_reason = 2; |
| 5331 // Faster loop, no 7-bit-encodings possible, approx 3000 GB/sec |
| 5332 //==================================== |
| 5333 while (src < srclimitfast2) { |
| 5334 // Skip to next interesting byte (this is the faster part) |
| 5335 while (src < srclimitfast4) { |
| 5336 uint32 u32 = *reinterpret_cast<const uint32*>(src); |
| 5337 src+= 4; |
| 5338 if ((u32 & 0x80808080) != 0) {src -= 4; break;} |
| 5339 } |
| 5340 while (src < srclimitfast2) { |
| 5341 uint8 uc = *src++; |
| 5342 if (static_cast<signed char>(uc) < 0) {src--; break;} |
| 5343 } |
| 5344 |
| 5345 if (src < srclimitfast2) { |
| 5346 if (FLAGS_enc_detect_source) { |
| 5347 PsSource(src, isrc, srctextlimit); |
| 5348 PsMark(src, 2, isrc, 0); |
| 5349 } |
| 5350 // saves byte pair and offset |
| 5351 bool pruned = IncrementAndBoostPrune(src, srctextlimit - src, |
| 5352 &destate, 0, exit_reason); |
| 5353 src += exit_reason; // 1 Ascii, 2 other |
| 5354 if (pruned) { |
| 5355 // Scoring and active encodings have been updated |
| 5356 if (destate.done) {break;} |
| 5357 } |
| 5358 } |
| 5359 } |
| 5360 //==================================== |
| 5361 // We reached the end of fast scan |
| 5362 |
| 5363 // If we are exactly at the end of the source, make sure we look at the very |
| 5364 // last byte. |
| 5365 if (src == (srctextlimit - 1) && !very_last_byte_incremented) { |
| 5366 exit_reason = scan_table[*src]; |
| 5367 if (exit_reason != 0) { |
| 5368 // The very last byte is an interesting byte |
| 5369 // Saves byte pair and offset |
| 5370 //printf("Interesting very last fast byte = 0x%02x\n", *src); |
| 5371 IncrementAndBoostPrune(src, srctextlimit - src, &destate, 0, exit_reason
); |
| 5372 very_last_byte_incremented = true; |
| 5373 } |
| 5374 } |
| 5375 |
| 5376 } // End if !done |
| 5377 |
| 5378 if (FLAGS_enc_detect_source) { |
| 5379 PsSource(src, isrc, srctextlimit); |
| 5380 PsMark(src, 2, isrc, 0); |
| 5381 } |
| 5382 // Force a pruning based on whatever we have |
| 5383 BoostPrune(src, &destate, PRUNE_FINAL); |
| 5384 |
| 5385 if (FLAGS_enc_detect_summary) { |
| 5386 DumpSummary(&destate, AsciiPair, 32); |
| 5387 DumpSummary(&destate, OtherPair, 32); |
| 5388 } |
| 5389 if (FLAGS_enc_detect_source) { |
| 5390 PsSourceFinish(); |
| 5391 } |
| 5392 if (destate.debug_data != NULL) { |
| 5393 //// DumpDetail(&destate); |
| 5394 } |
| 5395 |
| 5396 |
| 5397 if (ignored_some_tag_text && |
| 5398 (kMapToEncoding[destate.top_rankedencoding] == ASCII_7BIT)) { |
| 5399 // There were some interesting bytes, but only in tag text. |
| 5400 // Recursive call to reprocess looking at the tags this time. |
| 5401 |
| 5402 if (destate.debug_data != NULL) { |
| 5403 SetDetailsEncLabel(&destate, ">> Recurse/tags"); |
| 5404 // Print the current chart before recursive call |
| 5405 DumpDetail(&destate); |
| 5406 |
| 5407 char buff[32]; |
| 5408 snprintf(buff, sizeof(buff), ">> Recurse for tags"); |
| 5409 PsRecurse(buff); |
| 5410 } |
| 5411 |
| 5412 // Recursive call for high bytes in tags [no longer used, 1/16 tag score] |
| 5413 Encoding enc2 = InternalDetectEncoding( |
| 5414 kCEDForceTags, // force |
| 5415 text, |
| 5416 text_length, |
| 5417 url_hint, |
| 5418 http_charset_hint, |
| 5419 meta_charset_hint, |
| 5420 encoding_hint, |
| 5421 language_hint, |
| 5422 corpus_type, |
| 5423 ignore_7bit_mail_encodings, |
| 5424 bytes_consumed, |
| 5425 is_reliable, |
| 5426 second_best_enc); |
| 5427 |
| 5428 if (destate.debug_data != NULL) { |
| 5429 // Show winning encoding and dump PostScript |
| 5430 char buff[32]; |
| 5431 snprintf(buff, sizeof(buff), "=2 %s", MyEncodingName(enc2)); |
| 5432 SetDetailsEncProb(&destate, |
| 5433 0, |
| 5434 CompactEncDet::BackmapEncodingToRankedEncoding(enc2), |
| 5435 buff); |
| 5436 DumpDetail(&destate); |
| 5437 } |
| 5438 |
| 5439 return enc2; |
| 5440 } |
| 5441 |
| 5442 |
| 5443 // If the detected encoding does not match default/hints, or if the hints |
| 5444 // conflict with each other, mark as unreliable. This can be used to trigger |
| 5445 // further scoring. |
| 5446 // Three buckets of input documents; |
| 5447 // ~19% of the web no hints, and top == 7bit, Latin1, or CP1252 |
| 5448 // ~79% of the web one or more hints, all same encoding X and top == X |
| 5449 // ~ 2% of the web one or more hints that are inconsistent |
| 5450 |
| 5451 Encoding top_enc = kMapToEncoding[destate.top_rankedencoding]; |
| 5452 Encoding one_hint = destate.http_hint; |
| 5453 if ((one_hint == UNKNOWN_ENCODING) && |
| 5454 (destate.meta_hint != UNKNOWN_ENCODING)) { |
| 5455 one_hint = destate.meta_hint; |
| 5456 } |
| 5457 if ((one_hint == UNKNOWN_ENCODING) && |
| 5458 (destate.bom_hint != UNKNOWN_ENCODING)) { |
| 5459 one_hint = destate.bom_hint; |
| 5460 } |
| 5461 |
| 5462 bool found_compatible_encoding = true; |
| 5463 if (one_hint == UNKNOWN_ENCODING) { |
| 5464 // [~14% of the web] No hints, and top == 7bit, Latin1, or CP1252 |
| 5465 if (!CompatibleEnc(ISO_8859_1, top_enc)) { |
| 5466 found_compatible_encoding = false; |
| 5467 // If there is nothing but a TLD hint and its top encoding matches, OK |
| 5468 if ((destate.tld_hint != UNKNOWN_ENCODING) && |
| 5469 CompatibleEnc(destate.tld_hint, top_enc)) { |
| 5470 found_compatible_encoding = true; |
| 5471 } |
| 5472 } |
| 5473 } else if (CompatibleEnc(one_hint, destate.http_hint) && |
| 5474 CompatibleEnc(one_hint, destate.meta_hint) && |
| 5475 CompatibleEnc(one_hint, destate.bom_hint)) { |
| 5476 // [~83% of the web] One or more hints, all same encoding X and top == X |
| 5477 if (!CompatibleEnc(one_hint, top_enc)) { |
| 5478 // [~ 2% of the web] Oops, not the declared encoding |
| 5479 found_compatible_encoding = false; |
| 5480 } |
| 5481 } else { |
| 5482 // [~ 3% of the web] Two or more hints that are inconsistent |
| 5483 one_hint = UNKNOWN_ENCODING; |
| 5484 found_compatible_encoding = false; |
| 5485 } |
| 5486 |
| 5487 // If we turned Latin1 into Latin2 or 7 via trigrams, don't fail it here |
| 5488 if (destate.do_latin_trigrams) { |
| 5489 if (CompatibleEnc(kMapToEncoding[F_Latin1], top_enc) || |
| 5490 CompatibleEnc(kMapToEncoding[F_Latin2], top_enc) || |
| 5491 CompatibleEnc(kMapToEncoding[F_CP1250], top_enc) || |
| 5492 CompatibleEnc(kMapToEncoding[F_ISO_8859_13], top_enc)) { |
| 5493 found_compatible_encoding = true; |
| 5494 destate.reliable = true; |
| 5495 } |
| 5496 } |
| 5497 |
| 5498 // If top encoding is not compatible with the hints, but it is reliably |
| 5499 // UTF-8, accept it anyway. |
| 5500 // This will perform badly with mixed UTF-8 prefix plus another encoding in |
| 5501 // the body if done too early, so we want to be rescanning. |
| 5502 if (!found_compatible_encoding && |
| 5503 destate.reliable && |
| 5504 NoHintsCloseEnoughCompatible(top_enc) && |
| 5505 (destate.next_interesting_pair[OtherPair] >= kStrongPairs) && |
| 5506 CEDFlagRescanning(flags)) { |
| 5507 found_compatible_encoding = true; |
| 5508 } |
| 5509 |
| 5510 // Hold off on this so Rescan() can see if the original encoding was reliable |
| 5511 //if (!found_compatible_encoding) { |
| 5512 // destate.reliable = false; |
| 5513 //} |
| 5514 |
| 5515 // If unreliable, try rescoring to separate some encodings |
| 5516 if (!destate.reliable || !found_compatible_encoding) { |
| 5517 top_enc = Rescore(top_enc, isrc, srctextlimit, &destate); |
| 5518 } |
| 5519 |
| 5520 *second_best_enc = kMapToEncoding[destate.second_top_rankedencoding]; |
| 5521 |
| 5522 // If unreliable, and not already rescanning, |
| 5523 // rescan middle of document to see if we can get a better |
| 5524 // answer. Rescan is only worthwhile if there are ~200 bytes or more left, |
| 5525 // since the detector takes as much as 96 bytes of bigrams to decide. |
| 5526 // |
| 5527 // CANNOT retry ISO-2022-xx HZ etc. because no declaration escape at the front |
| 5528 // or we may land in the middle of some partial state. Skip them all. |
| 5529 // |
| 5530 if ((!destate.reliable || !found_compatible_encoding) && |
| 5531 !CEDFlagRescanning(flags) && |
| 5532 !SevenBitEncoding(top_enc)) { |
| 5533 top_enc = Rescan(top_enc, |
| 5534 isrc, |
| 5535 src, |
| 5536 srctextlimit, |
| 5537 url_hint, |
| 5538 http_charset_hint, |
| 5539 meta_charset_hint, |
| 5540 encoding_hint, |
| 5541 language_hint, |
| 5542 corpus_type, |
| 5543 ignore_7bit_mail_encodings, |
| 5544 &destate); |
| 5545 } else { |
| 5546 if (!found_compatible_encoding) { |
| 5547 destate.reliable = false; |
| 5548 } |
| 5549 } |
| 5550 |
| 5551 if (destate.debug_data != NULL) { |
| 5552 // Dump PostScript |
| 5553 DumpDetail(&destate); |
| 5554 } |
| 5555 |
| 5556 *bytes_consumed = src - isrc + 1; // We looked 1 byte beyond src |
| 5557 *is_reliable = destate.reliable; |
| 5558 return top_enc; |
| 5559 } |
| 5560 |
| 5561 Encoding CompactEncDet::DetectEncoding( |
| 5562 const char* text, int text_length, const char* url_hint, |
| 5563 const char* http_charset_hint, const char* meta_charset_hint, |
| 5564 const int encoding_hint, |
| 5565 const Language language_hint, // User interface lang |
| 5566 const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings, |
| 5567 int* bytes_consumed, bool* is_reliable) { |
| 5568 if (FLAGS_ced_echo_input) { |
| 5569 string temp(text, text_length); |
| 5570 fprintf(stderr, "CompactEncDet::DetectEncoding()\n%s\n\n", temp.c_str()); |
| 5571 } |
| 5572 |
| 5573 if (FLAGS_counts) { |
| 5574 encdet_used = 0; |
| 5575 rescore_used = 0; |
| 5576 rescan_used = 0; |
| 5577 robust_used = 0; |
| 5578 looking_used = 0; |
| 5579 doing_used = 0; |
| 5580 ++encdet_used; |
| 5581 } |
| 5582 if (FLAGS_dirtsimple) { |
| 5583 // Just count first 64KB bigram encoding probabilities for each encoding |
| 5584 int robust_renc_list_len; // Number of active encodings |
| 5585 int robust_renc_list[NUM_RANKEDENCODING]; // List of ranked encodings |
| 5586 int robust_renc_probs[NUM_RANKEDENCODING]; // List of matching probs |
| 5587 |
| 5588 for (int i = 0; i < NUM_RANKEDENCODING; ++i) { |
| 5589 robust_renc_list[i] = i; |
| 5590 } |
| 5591 robust_renc_list_len = NUM_RANKEDENCODING; |
| 5592 |
| 5593 RobustScan(text, text_length, |
| 5594 robust_renc_list_len, robust_renc_list, robust_renc_probs); |
| 5595 |
| 5596 // Pick off best encoding |
| 5597 int best_prob = -1; |
| 5598 Encoding enc = UNKNOWN_ENCODING; |
| 5599 for (int i = 0; i < robust_renc_list_len; ++i) { |
| 5600 if (best_prob < robust_renc_probs[i]) { |
| 5601 best_prob = robust_renc_probs[i]; |
| 5602 enc = kMapToEncoding[robust_renc_list[i]]; |
| 5603 } |
| 5604 } |
| 5605 |
| 5606 *bytes_consumed = minint(text_length, (kMaxKBToRobustScan << 10)); |
| 5607 *is_reliable = true; |
| 5608 if (FLAGS_counts) { |
| 5609 printf("CEDcounts "); |
| 5610 while (encdet_used--) {printf("encdet ");} |
| 5611 while (rescore_used--) {printf("rescore ");} |
| 5612 while (rescan_used--) {printf("rescan ");} |
| 5613 while (robust_used--) {printf("robust ");} |
| 5614 while (looking_used--) {printf("looking ");} |
| 5615 while (doing_used--) {printf("doing ");} |
| 5616 printf("\n"); |
| 5617 } |
| 5618 |
| 5619 return enc; |
| 5620 } |
| 5621 |
| 5622 Encoding second_best_enc; |
| 5623 Encoding enc = InternalDetectEncoding(kCEDNone, |
| 5624 text, |
| 5625 text_length, |
| 5626 url_hint, |
| 5627 http_charset_hint, |
| 5628 meta_charset_hint, |
| 5629 encoding_hint, |
| 5630 language_hint, // User interface lang |
| 5631 corpus_type, |
| 5632 ignore_7bit_mail_encodings, |
| 5633 bytes_consumed, |
| 5634 is_reliable, |
| 5635 &second_best_enc); |
| 5636 if (FLAGS_counts) { |
| 5637 printf("CEDcounts "); |
| 5638 while (encdet_used--) {printf("encdet ");} |
| 5639 while (rescore_used--) {printf("rescore ");} |
| 5640 while (rescan_used--) {printf("rescan ");} |
| 5641 while (robust_used--) {printf("robust ");} |
| 5642 while (looking_used--) {printf("looking ");} |
| 5643 while (doing_used--) {printf("doing ");} |
| 5644 printf("\n"); |
| 5645 } |
| 5646 return enc; |
| 5647 } |
| 5648 |
| 5649 |
| 5650 // Return top encoding hint for given string |
| 5651 Encoding CompactEncDet::TopEncodingOfLangHint(const char* name) { |
| 5652 string normalized_lang = MakeChar8(string(name)); |
| 5653 int n = HintBinaryLookup8(kLangHintProbs, kLangHintProbsSize, |
| 5654 normalized_lang.c_str()); |
| 5655 if (n < 0) {return UNKNOWN_ENCODING;} |
| 5656 |
| 5657 // Charset is eight bytes, probability table is eight bytes |
| 5658 int toprankenc = |
| 5659 TopCompressedProb(&kLangHintProbs[n].key_prob[kMaxLangKey], |
| 5660 kMaxLangVector); |
| 5661 return kMapToEncoding[toprankenc]; |
| 5662 } |
| 5663 |
| 5664 // Return top encoding hint for given string |
| 5665 Encoding CompactEncDet::TopEncodingOfTLDHint(const char* name) { |
| 5666 string normalized_tld = MakeChar4(string(name)); |
| 5667 int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize, |
| 5668 normalized_tld.c_str()); |
| 5669 if (n < 0) {return UNKNOWN_ENCODING;} |
| 5670 |
| 5671 // TLD is four bytes, probability table is 12 bytes |
| 5672 int toprankenc = |
| 5673 TopCompressedProb(&kTLDHintProbs[n].key_prob[kMaxTldKey], |
| 5674 kMaxTldVector); |
| 5675 return kMapToEncoding[toprankenc]; |
| 5676 } |
| 5677 |
| 5678 // Return top encoding hint for given string |
| 5679 Encoding CompactEncDet::TopEncodingOfCharsetHint(const char* name) { |
| 5680 string normalized_charset = MakeChar44(string(name)); |
| 5681 int n = HintBinaryLookup8(kCharsetHintProbs, kCharsetHintProbsSize, |
| 5682 normalized_charset.c_str()); |
| 5683 if (n < 0) {return UNKNOWN_ENCODING;} |
| 5684 |
| 5685 // Charset is eight bytes, probability table is eight bytes |
| 5686 int toprankenc = |
| 5687 TopCompressedProb(&kCharsetHintProbs[n].key_prob[kMaxCharsetKey], |
| 5688 kMaxCharsetVector); |
| 5689 return kMapToEncoding[toprankenc]; |
| 5690 } |
| 5691 |
| 5692 const char* CompactEncDet::Version(void) { |
| 5693 return kVersion; |
| 5694 } |
OLD | NEW |