third_party/cld/encodings/compact_enc_det/compact_enc_det.cc - Issue 1956183002: CL for perf tryjob on linux

Side by Side Diff: third_party/cld/encodings/compact_enc_det/compact_enc_det.cc

Issue 1956183002: CL for perf tryjob on linux (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « third_party/cld/encodings/compact_enc_det/compact_enc_det.h ('k') | third_party/cld/encodings/compact_enc_det/compact_enc_det.swig » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 //

	2 // Copyright 2006, 2007 Google Inc. All Rights Reserved.

	3 // Author: dsites@google.com (Dick Sites)

	4 //

	5 // Design document: eng/designdocs/i18n/compact_encoding_detector.pdf

	6

	7 #include "encodings/compact_enc_det/compact_enc_det.h"

	8

	9 #include <math.h> // for sqrt

	10 #include <stddef.h> // for size_t

	11 #include <stdio.h> // for printf, fprintf, NULL, etc

	12 #include <stdlib.h> // for qsort

	13 #include <string.h> // for memset, memcpy, memcmp, etc

	14 #include <memory>

	15 #include <string> // for string, operator==, etc

	16

	17 //#include "base/basictypes.h" // for uint8, uint32, char32, etc

	18 //#include "base/commandlineflags.h" // for DEFINE_bool, <anonymous>, etc

	19 //#include "base/logging.h" // for COMPACT_GOOGLE_LOG_FATAL, etc

	20 //#include "base/macros.h" // for COMPILE_ASSERT, arraysize, etc

	21 #include "encodings/compact_enc_det/compact_enc_det_hint_code.h"

	22 #include "encodings/compact_lang_det/win/cld_basictypes.h"

	23 #include "encodings/compact_lang_det/win/cld_commandlineflags.h"

	24 #include "encodings/compact_lang_det/win/cld_logging.h"

	25 #include "encodings/compact_lang_det/win/cld_macros.h"

	26

	27 using std::string;

	28

	29 // TODO

	30 // dsites 2007.10.09

	31 //

	32 // Consider font=TT-BHxxx as user-defined => binary

	33 // Demote GB18030 if no 8x3x pair

	34 // Map byte2 ascii punct to 0x60, digits to 0x7e, gets them into hires

	35 // Consider removing/ignoring bytes 01-1F to avoid crap pollution

	36 // Possibly boost declared encoding in robust scan

	37 // googlebot tiny files

	38 // look for ranges of encodings

	39 // consider tags just as > < within aligned block of 32

	40 // flag too few characters in postproc (Latin 6 problem)

	41 // Remove slow scan beyond 16KB

	42 // Consider removing kMostLikelyEncoding or cut it in half

	43

	44

	45 // A note on mixed encodings

	46 //

	47 // The most common encoding error on the web is a page containing a mixture of

	48 // CP-1252 and UTF-8. A less common encoding error is a third-party feed that

	49 // has been converted from CP-1252 to UTF-8 and then those bytes converted a

	50 // second time to UTF-8. CED originally attempted to detect these error cases

	51 // by using two synthetic encodings, UTF8CP1252 and UTF8UTF8. The intended

	52 // implementation was to start these just below CP1252 and UTF8 respectively in

	53 // overall likelihood, and allow 1252 and UTF8 to fall behind if mixtures are

	54 // found.

	55 //

	56 // The UTF8UTF8 encoding is a possible outcome from CED, but unfortunately the

	57 // UTF8CP1252 internal encoding was added late and not put into encodings.proto,

	58 // so at the final step it is mapped to UTF8UTF8 also. This was a bad idea and

	59 // is removed in this November 2011 CL.

	60 //

	61 // Mixed encoding detection never worked out as well as envisioned, so the

	62 // ced_allow_utf8utf8 flag normally disables all this.

	63 //

	64 // The effect is that CP-1252 and UTF-8 mixtures will usually be detected as

	65 // UTF8, and the inputconverter code for UTF8 normally will convert bare

	66 // CP-1252 bytes to UTF-8, instead of the less-helpful FFFD substitution. UTF-8

	67 // and double-UTF-8 mixtures will be detected as UTF-8, and the double

	68 // conversion will stand.

	69 //

	70 // However, it is occasionally useful to use CED to detect double-converted

	71 // UTF-8 coming from third-party data feeds, so they can be fixed at the source.

	72 // For this purpose, the UTF8UTF8 encoding remains available under the

	73 // ced_allow_utf8utf8 flag.

	74 //

	75 // When UTF8UTF8 is detected, the inputconverter code will undo the double

	76 // conversion, giving good text.

	77

	78 // Norbert Runge has noted these words in CP1252 that are mistakenly identified

	79 // as UTF-8 because of the last pair of characters:

	80 // NESTLÉ® 0xC9 0xAE U+00C9 U+00AE C9AE = U+026E;SMALL LEZH

	81 // drauß\u2019 0xDF 0x92 U+00DF U+2019 DF92 = U+07D2;NKO LETTER N

	82 // Mutterschoß\u201c 0xDF 0x93 U+00DF U+201C DF93 = U+07D3;NKO LETTER BA

	83 // Schoß\u201c 0xDF 0x93 U+00DF U+201C

	84 // weiß\u201c 0xDF 0x93 U+00DF U+201C

	85 // Schnellfuß\u201c 0xDF 0x93 U+00DF U+201C

	86 // süß« 0xDF 0xAB U+00DF U+00AB DFAB = U+07EB;NKO HIGH TONE

	87 // These four byte combinations now explicitly boost Latin1/CP1252.

	88

	89 // And for reference, here are a couple of Portuguese spellings

	90 // that may be mistaken as double-byte encodings.

	91 // informações 0xE7 0xF5

	92 // traição 0xE7 0xE3

	93

	94

	95 static const char* kVersion = "2.2";

	96

	97 DEFINE_bool(ced_allow_utf8utf8, false, "Allow the UTF8UTF8 encoding, "

	98 "to handle mixtures of CP1252 "

	99 "converted to UTF-8 zero, one, "

	100 "or two times");

	101 DEFINE_int32(enc_detect_slow_max_kb, 16,

	102 "Maximum number of Kbytes to examine for "

	103 "7-bit-only (2022, Hz, UTF7) encoding detect. "

	104 "You are unlikely to want to change this.");

	105 DEFINE_int32(enc_detect_fast_max_kb, 256,

	106 "Maximum number of Kbytes to examine for encoding detect. "

	107 "You are unlikely to want to change this.");

	108

	109 DEFINE_int32(ced_reliable_difference, 300, "30 * Bits of minimum probablility "

	110 "difference 1st - 2nd to be considered reliable \n"

	111 " 2 corresponds to min 4x difference\n"

	112 " 4 corresponds to min 16x difference\n"

	113 " 8 corresponds to min 256x difference\n"

	114 " 10 corresponds to min 1024x difference\n"

	115 " 20 corresponds to min 1Mx difference.");

	116

	117 // Text debug output options

	118 DEFINE_bool(enc_detect_summary, false,

	119 "Print first 16 interesting pairs at exit.");

	120 DEFINE_bool(counts, false, "Count major-section usage");

	121

	122 // PostScript debug output options

	123 DEFINE_bool(enc_detect_detail, false,

	124 "Print PostScript of every update, to stderr.");

	125 DEFINE_bool(enc_detect_detail2, false,

	126 "More PostScript detail of every update, to stderr.");

	127 DEFINE_bool(enc_detect_source, false, "Include source text in detail");

	128 // Encoding name must exactly match FIRST column of kI18NInfoByEncoding in

	129 // lang_enc.cc

	130 DEFINE_string(enc_detect_watch1, "", "Do detail2 about this encoding name.");

	131 DEFINE_string(enc_detect_watch2, "", "Do detail2 about this encoding name.");

	132

	133

	134 // Only for experiments. Delete soon.

	135 DEFINE_bool(force127, false, "Force Latin1, Latin2, Latin7 based on trigrams");

	136

	137 // Demo-mode/debugging experiment

	138 DEFINE_bool(demo_nodefault, false,

	139 "Default to all equal; no boost for declared encoding.");

	140 DEFINE_bool(dirtsimple, false, "Just scan and count for all encodings");

	141 DEFINE_bool(ced_echo_input, false, "Echo ced input to stderr");

	142

	143

	// Probability scaling multipliers.
	// XDECILOG2: multiplier for log base 2 ** n/10.
	// XLOG2:     multiplier for log base 2 ** n.
	static const int XDECILOG2 = 3;
	static const int XLOG2 = 30;

	// Pruning thresholds: bits of minimum probability difference 1st-nth for an
	// encoding to be pruned.
	// Final value:
	static const int kFinalPruneDifference = 10 * XLOG2;
	// Initial value:
	static const int kInititalPruneDifference = kFinalPruneDifference * 4;
	// Amount by which the difference is decremented:
	static const int kPruneDiffDecrement = kFinalPruneDifference;

	// Bits of minimum probability difference, base to superset encodings.
	static const int kSmallInitDiff = 2 * XLOG2;

	// Bits of boost for initial byte patterns (BOM, 00).
	static const int kBoostInitial = 20 * XLOG2;

	// Bits of whack for one bad pair.
	static const int kBadPairWhack = 20 * XLOG2;

	// Bits of boost for one good pair in Hz, etc.
	static const int kBoostOnePair = 20 * XLOG2;

	// Bits of boost for one good sequence.
	static const int kGentleOnePair = 4 * XLOG2;

	// Bits of whack for an ill-formed sequence.
	static const int kGentlePairWhack = 2 * XLOG2;

	// Bits of boost for a well-formed sequence.
	static const int kGentlePairBoost = 2 * XLOG2;

	// Bits of boost for one good pair in Hz, etc.
	// NOTE(review): this wording duplicates kBoostOnePair's; the name suggests a
	// per-base64-byte boost -- confirm against the code that uses it.
	static const int kBoostPerB64Byte = 2 * XLOG2;

	// Bits/10 of boost for the best declared encoding per bigram.
	static const int kDeclaredEncBoost = 5 * XDECILOG2;

	// Bits/10 of boost for the best encoding per bigram.
	static const int kBestEncBoost = 5 * XDECILOG2;

	// Bits of boost for Latin127 trigrams.
	static const int kTrigramBoost = 2 * XLOG2;

	194

	// Max interesting pairs to look at. If you change this, adjust PruneDiff.
	static const int kMaxPairs = 48;

	// Prune every 8 interesting pairs.
	static const int kPruneMask = 0x07;

	// For the first N pairs, do extra boost based on the most likely encoding
	// of the pair over the entire web.
	static const int kBestPairsCount = 16;

	// If we have fewer than N bigrams, weaken the hints enough that unhinted
	// encodings have a hope of rising to the top.
	static const int kDerateHintsBelow = 12;

	// Don't bother rescanning for an unreliable encoding if fewer than this
	// many bytes are unscanned. We will rescan at most the last half of this.
	static const int kMinRescanLength = 800;

	// Make F_BINARY the only encoding.
	static const int kStrongBinary = 12;
	// Make F_BINARY a likely encoding.
	static const int kWeakerBinary = 4;

	// Byte counts from the front of the file.
	// Not binary if all ASCII:
	static const int kBinaryHardAsciiLimit = 6 * 1024;
	// Not binary if mostly ASCII:
	static const int kBinarySoftAsciiLimit = 8 * 1024;

	// We try here to avoid having title text dominate the encoding detection,
	// for the not-infrequent error case of title in encoding1 and body in
	// encoding2: we want to bias toward encoding2 winning.
	//
	// kMaxBigramsTagTitleText should be a multiple of 2, 3, and 4, so that we
	// rarely cut off mid-character in the original (not-yet-detected) encoding.
	// This matters most for UTF-8 two- and three-byte codes and for Shift-JIS
	// three-byte codes.
	// Keep only some tag text:
	static const int kMaxBigramsTagTitleText = 12;
	// Give text in tags, etc. 1/16 normal weight:
	static const int kWeightshiftForTagTitleText = 4;

	// Let a reliable encoding with this many pairs overcome a missing hint.
	static const int kStrongPairs = 6;

	237 // pairs overcome missing hint

	238

	// Internal control flags; each non-empty flag occupies its own bit.
	enum CEDInternalFlags {
	  // The empty flag.
	  kCEDNone = 0,
	  // Do not further recurse.
	  kCEDRescanning = 1 << 0,
	  // Do extra scoring.
	  kCEDSlowscore = 1 << 1,
	  // Always examine text inside tags.
	  kCEDForceTags = 1 << 2,
	};

	245

	246 // Forward declaration

	247 Encoding InternalDetectEncoding(

	248 CEDInternalFlags flags, const char* text, int text_length,

	249 const char* url_hint, const char* http_charset_hint,

	250 const char* meta_charset_hint, const int encoding_hint,

	251 const Language language_hint, // User interface lang

	252 const CompactEncDet::TextCorpusType corpus_type,

	253 bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable,

	254 Encoding* second_best_enc);

	255

	256 typedef struct {

	257 const uint8* hires[4]; // Pointers to possible high-resolution bigram deltas

	258 uint8 x_bar; // Average byte2 value

	259 uint8 y_bar; // Average byte1 value

	260 uint8 x_stddev; // Standard deviation of byte2 value

	261 uint8 y_stddev; // Standard deviation of byte1 value

	262 int so; // Scaling offset -- add to probabilities below

	263 const uint8 b1[256]; // Unigram probability for first byte of aligned bigram

	264 const uint8 b2[256]; // Unigram probability for second byte of aligned bigram

	265 const uint8 b12[256]; // Unigram probability for cross bytes of aligned bigram

	266 } UnigramEntry;

	267

	268 //typedef struct {

	269 // uint8 b12[256*256]; // Bigram probability for aligned bigram

	270 //} FullBigramEntry;

	271

	272

	273 // Include all the postproc-generated tables here:

	274 // RankedEncoding

	275 // kMapToEncoding

	276 // unigram_table

	277 // kMostLikelyEncoding

	278 // kTLDHintProbs

	279 // kCharsetHintProbs

	280 // HintEntry, kMaxTldKey kMaxTldVector, etc.

	281 // =============================================================================

	282

	283 #include "encodings/compact_enc_det/compact_enc_det_generated_tables.h"

	284

	285

	// "ASCII" is a misnomer, so this code uses "Latin1".
	#define F_ASCII F_Latin1

	// The aliases below exist because we are mid-update for a name change.
	#define F_BINARY F_X_BINARYENC
	#define F_UTF8UTF8 F_X_UTF8UTF8
	#define F_BIG5_CP950 F_BIG5
	#define F_Unicode F_UTF_16LE

	292 // =============================================================================

	293

	294 // 7-bit encodings have at least one "interesting" byte value < 0x80

	295 // (00 0E 1B + ~)

	296 // JIS 2022-cn 2022-kr hz utf7

	297 // Unicode UTF-16 UTF-32

	298 // 8-bit encodings have no interesting byte values < 0x80

	299 static const uint32 kSevenBitActive = 0x00000001; // needs <80 to detect

	300 static const uint32 kUTF7Active = 0x00000002; // <80 and +

	301 static const uint32 kHzActive = 0x00000004; // <80 and ~

	302 static const uint32 kIso2022Active = 0x00000008; // <80 and 1B 0E 0F

	303 static const uint32 kUTF8Active = 0x00000010;

	304 static const uint32 kUTF8UTF8Active = 0x00000020;

	305 static const uint32 kUTF1632Active = 0x00000040; // <80 and 00

	306 static const uint32 kBinaryActive = 0x00000080; // <80 and 00

	307 static const uint32 kTwobyteCode = 0x00000100; // Needs 8xxx

	308 static const uint32 kIsIndicCode = 0x00000200; //

	309 static const uint32 kHighAlphaCode = 0x00000400; // full alphabet in 8x-Fx

	310 static const uint32 kHighAccentCode = 0x00000800; // accents in 8x-Fx

	311 static const uint32 kEUCJPActive = 0x00001000; // Have to mess with phase

	312

	313

	// Usage counters, for debugging only. These are plain globals and are not
	// thread safe.
	static int encdet_used = 0;
	static int rescore_used = 0;
	static int rescan_used = 0;
	static int robust_used = 0;
	static int looking_used = 0;
	static int doing_used = 0;

	321

	322

	323 // For debugging only -- about 256B/entry times about 500 = 128KB

	324 // TODO: only allocate this if being used

	325 typedef struct {

	326 int offset;

	327 int best_enc; // Best ranked encoding for this bigram, or

	328 // -1 for overhead entries

	329 string label;

	330 int detail_enc_prob[NUM_RANKEDENCODING];

	331 } DetailEntry;

	332

	// Debug-only watch slots, initially -1. Not threadsafe.
	static int watch1_rankedenc = -1;
	static int watch2_rankedenc = -1;

	335 ////static int next_detail_entry = 0; // Debug. not threadsafe

	336 ////static DetailEntry details[kMaxPairs * 10]; // Allow 10 details per bigram

	337 // End For debugging only

	338

	// The two sets of interesting pairs. Values must match the
	// kTestPrintableAsciiTildePlus exit codes, minus one.
	enum PairSet {
	  AsciiPair = 0,
	  OtherPair = 1,
	  NUM_PAIR_SETS = 2
	};

	341

	// The reasons for pruning.
	enum PruneReason {
	  PRUNE_NORMAL = 0,
	  PRUNE_SLOWEND = 1,
	  PRUNE_FINAL = 2
	};

	344

	// Printable names for the two pair sets.
	static const char* kWhatSetName[] = {
	  "Ascii",
	  "Other",
	};

	346

	347

	// State for encodings that do shift-out/shift-in between one- and two-byte
	// regions (ISO-2022-xx, HZ).
	enum StateSoSi {
	  SOSI_NONE = 0,
	  SOSI_ERROR = 1,
	  SOSI_ONEBYTE = 2,
	  SOSI_TWOBYTE = 3
	};

	351

	352 typedef struct {

	353 const uint8* initial_src; // For calculating byte offsets

	354 const uint8* limit_src; // Range of input source

	355 const uint8* prior_src; // Source consumed by prior call to BoostPrune

	356 const uint8* last_pair; // Last pair inserted into interesting_pairs

	357

	358 DetailEntry* debug_data; // Normally NULL. Ptr to debug data for

	359 // FLAGS_enc_detect_detail PostScript data

	360 int next_detail_entry; // Debug

	361

	362 bool done;

	363 bool reliable;

	364 bool hints_derated;

	365 int declared_enc_1; // From http/meta hint

	366 int declared_enc_2; // from http/meta hint

	367 int prune_count; // Number of times we have pruned

	368

	369 int trigram_highwater_mark; // Byte offset of last trigram processing

	370 bool looking_for_latin_trigrams; // True if we should test for doing

	371 // Latin1/2/7 trigram processing

	372 bool do_latin_trigrams; // True if we actually are scoring trigrams

	373

	374 // Miscellaneous state variables for difficult encodings

	375 int binary_quadrants_count; // Number of four bigram quadrants seen:

	376 // 0xxxxxxx0xxxxxxx 0xxxxxxx1xxxxxx

	377 // 1xxxxxxx0xxxxxxx 1xxxxxxx1xxxxxx

	378 int binary_8x4_count; // Number of 8x4 buckets seen:

	379 uint32 binary_quadrants_seen; // Bit[i] set if bigram i.......i....... seen

	380 uint32 binary_8x4_seen; // Bit[i] set if bigram iii.....ii...... seen

	381 int utf7_starts; // Count of possible UTF-7 beginnings seen

	382 int prior_utf7_offset; // Source consumed by prior UTF-7 string

	383 int next_utf8_ministate; // Mini state for UTF-8 sequences

	384 int utf8_minicount[6]; // Number of correct 2- 3- 4-byte seq, errors

	385 int next_utf8utf8_ministate; // Mini state for UTF8UTF8 sequences

	386 int utf8utf8_odd_byte; // UTF8UTF8 seq has odd number of bytes

	387 int utf8utf8_minicount[6]; // Number of correct 2- 3- 4-byte seq, errors

	388 StateSoSi next_2022_state; // Mini state for 2022 sequences

	389 StateSoSi next_hz_state; // Mini state for HZ sequences

	390 bool next_eucjp_oddphase; // Mini state for EUC-JP sequences

	391 int byte32_count[8]; // Count of top 3 bits of byte1 of bigram

	392 // 0x1x 2x3x 4x5x 6x7x 8x9x AxBx CxDx ExFx

	393 uint32 active_special; // Bits showing which special cases are active

	394

	395 Encoding tld_hint; // Top TLD encoding or UNKNOWN

	396 Encoding http_hint; // What the document says about itself or

	397 Encoding meta_hint; // UNKNOWN_ENCODING. BOM is initial byte

	398 Encoding bom_hint; // order mark for UTF-xx

	399

	400 // small cache of previous interesting bigrams

	401 int next_prior_bigram;

	402 int prior_bigram[4];

	403 int prior_binary[1];

	404

	405 int top_rankedencoding; // Top two probabilities and families

	406 int second_top_rankedencoding;

	407 int top_prob;

	408 int second_top_prob;

	409 int prune_difference; // Prune things this much below the top prob

	410 int rankedencoding_list_len; // Number of active encodings

	411 int rankedencoding_list[NUM_RANKEDENCODING]; // List of active encodings

	412 //

	413 int enc_prob[NUM_RANKEDENCODING]; // Cumulative probability per enc

	414 // This is where all the action is

	415 int hint_prob[NUM_RANKEDENCODING]; // Initial hint probabilities

	416 int hint_weight[NUM_RANKEDENCODING]; // Number of hints for this enc

	417

	418 // Two sets -- one for printable ASCII, one for the rest

	419 int prior_interesting_pair[NUM_PAIR_SETS]; // Pairs consumed by prior call

	420 int next_interesting_pair[NUM_PAIR_SETS]; // Next pair to write

	421 char interesting_pairs[NUM_PAIR_SETS][kMaxPairs * 2]; // Two bytes per pair

	422 int interesting_offsets[NUM_PAIR_SETS][kMaxPairs]; // Src offset of pair

	423 int interesting_weightshift[NUM_PAIR_SETS][kMaxPairs]; // weightshift of pair

	424 } DetectEncodingState;

	425

	426

	427 // Record a debug event that changes probabilities

	428 void SetDetailsEncProb(DetectEncodingState* destatep,

	429 int offset, int best_enc, const char* label) {

	430 int next = destatep->next_detail_entry;

	431 destatep->debug_data[next].offset = offset;

	432 destatep->debug_data[next].best_enc = best_enc;

	433 destatep->debug_data[next].label = label;

	434 memcpy(&destatep->debug_data[next].detail_enc_prob,

	435 &destatep->enc_prob,

	436 sizeof(destatep->enc_prob));

	437 ++destatep->next_detail_entry;

	438 }

	439

	440 // Record a debug event that changes probabilities, copy offset

	441 void SetDetailsEncProbCopyOffset(DetectEncodingState* destatep,

	442 int best_enc, const char* label) {

	443 int next = destatep->next_detail_entry;

	444 destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset;

	445 destatep->debug_data[next].best_enc = best_enc;

	446 destatep->debug_data[next].label = label;

	447 memcpy(&destatep->debug_data[next].detail_enc_prob,

	448 &destatep->enc_prob,

	449 sizeof(destatep->enc_prob));

	450 ++destatep->next_detail_entry;

	451 }

	452

	453 // Record a debug event that changes probs and has simple text label

	454 void SetDetailsEncLabel(DetectEncodingState* destatep, const char* label) {

	455 int next = destatep->next_detail_entry;

	456 destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset;

	457 destatep->debug_data[next].best_enc = -1;

	458 destatep->debug_data[next].label = label;

	459 memcpy(&destatep->debug_data[next].detail_enc_prob,

	460 &destatep->enc_prob,

	461 sizeof(destatep->enc_prob));

	462 ++destatep->next_detail_entry;

	463 }

	464

	465 // Record a debug event that is just a text label, no change in probs

	466 void SetDetailsLabel(DetectEncodingState* destatep, const char* label) {

	467 int next = destatep->next_detail_entry;

	468 destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset;

	469 destatep->debug_data[next].best_enc = -1;

	470 destatep->debug_data[next].label = label;

	471 memcpy(&destatep->debug_data[next].detail_enc_prob,

	472 &destatep->debug_data[next - 1].detail_enc_prob,

	473 sizeof(destatep->enc_prob));

	474 ++destatep->next_detail_entry;

	475 }

	476

	477

	478 // Maps superset encodings to base, to see if 2 encodings are compatible

	479 // (Non-identity mappings are marked "-->" below.)

	480 static const Encoding kMapEncToBaseEncoding[] = {

	481 ISO_8859_1, // 0: Teragram ASCII

	482 ISO_8859_2, // 1: Teragram Latin2

	483 ISO_8859_3, // 2: in BasisTech but not in Teragram

	484 ISO_8859_4, // 3: Teragram Latin4

	485 ISO_8859_5, // 4: Teragram ISO-8859-5

	486 ISO_8859_6, // 5: Teragram Arabic

	487 ISO_8859_7, // 6: Teragram Greek

	488 MSFT_CP1255, // 7: Teragram Hebrew --> 36

	489 ISO_8859_9, // 8: in BasisTech but not in Teragram

	490 ISO_8859_10, // 9: in BasisTech but not in Teragram

	491 JAPANESE_EUC_JP, // 10: Teragram EUC_JP

	492 JAPANESE_SHIFT_JIS, // 11: Teragram SJS

	493 JAPANESE_JIS, // 12: Teragram JIS

	494 CHINESE_BIG5, // 13: Teragram BIG5

	495 CHINESE_GB, // 14: Teragram GB

	496 CHINESE_EUC_CN, // 15: Teragram EUC-CN

	497 KOREAN_EUC_KR, // 16: Teragram KSC

	498 UNICODE, // 17: Teragram Unicode

	499 CHINESE_EUC_CN, // 18: Teragram EUC --> 15

	500 CHINESE_EUC_CN, // 19: Teragram CNS --> 15

	501 CHINESE_BIG5, // 20: Teragram BIG5_CP950 --> 13

	502 JAPANESE_SHIFT_JIS, // 21: Teragram CP932 --> 11

	503 UTF8, // 22

	504 UNKNOWN_ENCODING, // 23

	505 ISO_8859_1, // 24: ISO_8859_1 with all characters <= 127 --> 0

	506 RUSSIAN_KOI8_R, // 25: Teragram KOI8R

	507 RUSSIAN_CP1251, // 26: Teragram CP1251

	508 ISO_8859_1, // 27: CP1252 aka MSFT euro ascii --> 0

	509 RUSSIAN_KOI8_RU, // 28: CP21866 aka KOI8_RU, used for Ukrainian

	510 MSFT_CP1250, // 29: CP1250 aka MSFT eastern european

	511 ISO_8859_1, // 30: aka ISO_8859_0 aka ISO_8859_1 euroized --> 0

	512 ISO_8859_9, // 31: used for Turkish

	513 ISO_8859_13, // 32: used in Baltic countries --> 43

	514 ISO_8859_11, // 33: aka TIS-620, used for Thai

	515 ISO_8859_11, // 34: used for Thai --> 33

	516 MSFT_CP1256, // 35: used for Arabic

	517 MSFT_CP1255, // 36: Logical Hebrew Microsoft

	518 MSFT_CP1255, // 37: Iso Hebrew Logical --> 36

	519 MSFT_CP1255, // 38: Iso Hebrew Visual --> 36

	520 CZECH_CP852, // 39

	521 ISO_8859_2, // 40: aka ISO_IR_139 aka KOI8_CS --> 1

	522 MSFT_CP1253, // 41: used for Greek, but NOT a superset of 8859-7

	523 RUSSIAN_CP866, // 42

	524 ISO_8859_13, // 43

	525 ISO_2022_KR, // 44

	526 CHINESE_GB, // 45 GBK --> 14

	527 CHINESE_GB, // 46 GB18030 --> 14

	528 CHINESE_BIG5, // 47 BIG5_HKSCS --> 13

	529 ISO_2022_KR, // 48 ISO_2022_CN --> 44

	530 TSCII, // 49 Indic encoding

	531 TAMIL_MONO, // 50 Indic encoding - Tamil

	532 TAMIL_BI, // 51 Indic encoding - Tamil

	533 JAGRAN, // 52 Indic encoding - Devanagari

	534 MACINTOSH_ROMAN, // 53

	535 UTF7, // 54

	536 BHASKAR, // 55 Indic encoding - Devanagari

	537 HTCHANAKYA, // 56 Indic encoding - Devanagari

	538 UTF16BE, // 57

	539 UTF16LE, // 58

	540 UTF32BE, // 59

	541 UTF32LE, // 60

	542 BINARYENC, // 61

	543 HZ_GB_2312, // 62

	544 UTF8UTF8, // 63

	545 TAM_ELANGO, // 64 Elango - Tamil

	546 TAM_LTTMBARANI, // 65 Barani - Tamil

	547 TAM_SHREE, // 66 Shree - Tamil

	548 TAM_TBOOMIS, // 67 TBoomis - Tamil

	549 TAM_TMNEWS, // 68 TMNews - Tamil

	550 TAM_WEBTAMIL, // 69 Webtamil - Tamil

	551 KDDI_SHIFT_JIS, // 70 KDDI Shift_JIS

	552 DOCOMO_SHIFT_JIS, // 71 DoCoMo Shift_JIS

	553 SOFTBANK_SHIFT_JIS, // 72 SoftBank Shift_JIS

	554 KDDI_ISO_2022_JP, // 73 KDDI ISO-2022-JP

	555 SOFTBANK_ISO_2022_JP, // 74 SOFTBANK ISO-2022-JP

	556 };

	557

	558 COMPILE_ASSERT(arraysize(kMapEncToBaseEncoding) == NUM_ENCODINGS,

	559 kMapEncToBaseEncoding_has_incorrect_size);

	560

	561 // Maps base encodings to 0, supersets to 1+, undesired to -1

	562 // (Non-identity mappings are marked "-->" below.)

	563 static const int kMapEncToSuperLevel[] = {

	564 0, // 0: Teragram ASCII

	565 0, // 1: Teragram Latin2

	566 0, // 2: in BasisTech but not in Teragram

	567 0, // 3: Teragram Latin4

	568 0, // 4: Teragram ISO-8859-5

	569 0, // 5: Teragram Arabic

	570 0, // 6: Teragram Greek

	571 0, // 7: Teragram Hebrew

	572 0, // 8: in BasisTech but not in Teragram

	573 0, // 9: in BasisTech but not in Teragram

	574 0, // 10: Teragram EUC_JP

	575 0, // 11: Teragram SJS

	576 0, // 12: Teragram JIS

	577 0, // 13: Teragram BIG5

	578 0, // 14: Teragram GB

	579 0, // 15: Teragram EUC-CN

	580 0, // 16: Teragram KSC

	581 0, // 17: Teragram Unicode

	582 -1, // 18: Teragram EUC --> 15

	583 -1, // 19: Teragram CNS --> 15

	584 1, // 20: Teragram BIG5_CP950 --> 13

	585 1, // 21: Teragram CP932 --> 11

	586 0, // 22

	587 -1, // 23

	588 -1, // 24: ISO_8859_1 with all characters <= 127 --> 0

	589 0, // 25: Teragram KOI8R

	590 0, // 26: Teragram CP1251

	591 1, // 27: CP1252 aka MSFT euro ascii --> 0

	592 0, // 28: CP21866 aka KOI8_RU, used for Ukrainian

	593 0, // 29: CP1250 aka MSFT eastern european

	594 1, // 30: aka ISO_8859_0 aka ISO_8859_1 euroized --> 0

	595 0, // 31: used for Turkish

	596 1, // 32: used in Baltic countries --> 43

	597 0, // 33: aka TIS-620, used for Thai

	598 1, // 34: used for Thai --> 33

	599 0, // 35: used for Arabic

	600 0, // 36: Logical Hebrew Microsoft

	601 -1, // 37: Iso Hebrew Logical --> 36

	602 -1, // 38: Iso Hebrew Visual --> 7

	603 0, // 39

	604 1, // 40: aka ISO_IR_139 aka KOI8_CS --> 1

	605 0, // 41: used for Greek, NOT superset of 8859-7

	606 0, // 42

	607 0, // 43

	608 0, // 44

	609 1, // 45 GBK --> 14

	610 1, // 46 GB18030 --> 14

	611 1, // 47 BIG5_HKSCS --> 13

	612 1, // 48 ISO_2022_CN --> 44

	613 0, // 49 Indic encoding

	614 0, // 50 Indic encoding - Tamil

	615 0, // 51 Indic encoding - Tamil

	616 0, // 52 Indic encoding - Devanagari

	617 0, // 53

	618 0, // 54

	619 0, // 55 Indic encoding - Devanagari

	620 0, // 56 Indic encoding - Devanagari

	621 0, // 57

	622 0, // 58

	623 0, // 59

	624 0, // 60

	625 0, // 61

	626 0, // 62

	627 2, // 63

	628 0, 0, 0, 0, 0, 0, // add six more Tamil

	629 0, 0, 0, 0, 0, // add five encodings with emoji

	630 };

	631

	632 COMPILE_ASSERT(arraysize(kMapEncToSuperLevel) == NUM_ENCODINGS,

	633 kMapEncToSuperLevel_has_incorrect_size);

	634

	635

	636

	637 // Subscripted by Encoding enum value

	638 static const uint32 kSpecialMask[] = {

	639 kHighAccentCode, // 0

	640 kHighAccentCode,

	641 kHighAccentCode,

	642 kHighAccentCode,

	643 kHighAlphaCode, // 4

	644 kHighAlphaCode,

	645 kHighAlphaCode,

	646 kHighAlphaCode,

	647 kHighAccentCode,

	648 kHighAccentCode,

	649

	650 kTwobyteCode + kEUCJPActive, // 10 euc-jp

	651 kTwobyteCode,

	652 kSevenBitActive + kIso2022Active, // jis

	653 kTwobyteCode,

	654 kTwobyteCode,

	655 kTwobyteCode,

	656 kTwobyteCode,

	657 kSevenBitActive + kUTF1632Active, // Unicode

	658 kTwobyteCode,

	659 kTwobyteCode,

	660

	661 kTwobyteCode, // 20

	662 kTwobyteCode,

	663 kUTF8Active, // UTF-8

	664 0,

	665 0,

	666 kHighAlphaCode, // 25

	667 kHighAlphaCode,

	668 kHighAccentCode,

	669 kHighAlphaCode,

	670 kHighAccentCode,

	671

	672 kHighAccentCode, // 30

	673 kHighAccentCode,

	674 kHighAccentCode,

	675 kHighAlphaCode,

	676 kHighAlphaCode,

	677 kHighAlphaCode, // 35

	678 kHighAlphaCode,

	679 kHighAlphaCode,

	680 kHighAlphaCode,

	681 0,

	682

	683 0, // 40

	684 kHighAlphaCode,

	685 kHighAlphaCode,

	686 kHighAccentCode,

	687 kSevenBitActive + kIso2022Active, // 2022-kr

	688 kTwobyteCode,

	689 kTwobyteCode,

	690 kTwobyteCode,

	691 kSevenBitActive + kIso2022Active, // 2022-cn

	692 kHighAlphaCode + kIsIndicCode, // 49 TSCII

	693

	694 kHighAlphaCode + kIsIndicCode, // 50 TAMIL_MONO

	695 kHighAlphaCode + kIsIndicCode, // 51 TAMIL_BI

	696 kHighAlphaCode + kIsIndicCode, // 52 JAGRAN

	697 kHighAccentCode, // 53 MACINTOSH_ROMAN

	698 kSevenBitActive + kUTF7Active, // 54 UTF-7

	699 kHighAlphaCode + kIsIndicCode, // 55 BHASKAR Indic encoding - Devanagari

	700 kHighAlphaCode + kIsIndicCode, // 56 HTCHANAKYA Indic encoding - Devanag ari

	701 kSevenBitActive + kUTF1632Active, // 57 UTF16BE

	702 kSevenBitActive + kUTF1632Active, // 58 UTF16LE

	703 kSevenBitActive + kUTF1632Active, // 59 UTF32BE

	704 kSevenBitActive + kUTF1632Active, // 60 UTF32LE

	705

	706 kSevenBitActive + kBinaryActive, // 61 BINARYENC

	707 kSevenBitActive + kHzActive, // 62 HZ_GB_2312

	708 kHighAccentCode + kUTF8Active + kUTF8UTF8Active, // 63 UTF8UTF8

	709 kHighAlphaCode + kIsIndicCode, // 64 Elango - Tamil

	710 kHighAlphaCode + kIsIndicCode, // 65 Barani - Tamil

	711 kHighAlphaCode + kIsIndicCode, // 66 Shree - Tamil

	712 kHighAlphaCode + kIsIndicCode, // 67 TBoomis - Tamil

	713 kHighAlphaCode + kIsIndicCode, // 68 TMNews - Tamil

	714 kHighAlphaCode + kIsIndicCode, // 69 Webtamil - Tamil

	715 kTwobyteCode, // 70 KDDI Shift_JIS

	716 kTwobyteCode, // 71 DoCoMo Shift_JIS

	717 kTwobyteCode, // 72 SoftBank Shift_JIS

	718 kSevenBitActive + kIso2022Active, // 73 KDDI-ISO-2022-JP

	719 kSevenBitActive + kIso2022Active, // 74 SOFTBANK-ISO-2022-JP

	720 };

	721

	722 COMPILE_ASSERT(arraysize(kSpecialMask) == NUM_ENCODINGS,

	723 kSpecialMask_has_incorrect_size);

	724

	725

	726 /***

	727 kHighAlphaCode -- full alphabet in 8x-Fx range, not just accents

	728

	729 ISO_8859_5, // 4: Teragram ISO-8859-5 Cyrl UL bd

	730 RUSSIAN_CP1251, // 26: Teragram CP1251 UL cdef

	731 RUSSIAN_KOI8_R, // 25: Teragram KOI8R LU cdef

	732 RUSSIAN_KOI8_RU, // 28: CP21866 aka KOI8_RU, LU cdef

	733 RUSSIAN_CP866, // 42 89ae

	734

	735 ISO_8859_6, // 5: Teragram Arabic nocase cde

	736 MSFT_CP1256, // 35: used for Arabic nocase cde

	737

	738 ISO_8859_7, // 6: Teragram Greek UL cdef

	739 MSFT_CP1253, // 41: used for Greek UL cdef

	740

	741 ISO_8859_8, // 7: Teragram Hebrew nocase ef

	742 MSFT_CP1255, // 36: Logical Hebrew Microsoft nocase ef

	743 ISO_8859_8_I, // 37: Iso Hebrew Logical nocase ef

	744 HEBREW_VISUAL, // 38: Iso Hebrew Visual nocase ef

	745

	746 ISO_8859_11, // 33: aka TIS-620, used for Thai nocase abcde

	747 MSFT_CP874, // 34: used for Thai nocase abcde

	748

	749 TSCII, // 49 8-f

	750 TAMIL_MONO, // 50

	751 TAMIL_BI, // 51

	752 JAGRAN, // 52

	753 BHASKAR, // 55 Indic encoding - Devanagari

	754 HTCHANAKYA, // 56 Indic encoding - Devanagari

	755 ***/

	756

	757 // We can scan bytes using this at about 500 MB/sec 2.8GHz P4

	758 // Slow scan uses this, stopping on NUL ESC SO SI bad C0 and + ~

	759 // We allow FF, 0x0C, here because it gives a better result for old

	760 // Ascii text formatted for a TTY

	761 // non-zero exits scan loop -- 1 for printable ASCII, 2 otherwise

	// Lookup table indexed by byte value (0..255). Per the entries below:
	//   0 = HT, LF, FF, CR and 0x20..0x7D other than '+'
	//   1 = '+' (0x2B) and '~' (0x7E)
	//   2 = every other C0 control, DEL (0x7F), and all bytes 0x80..0xFF
	762 static const char kTestPrintableAsciiTildePlus[256] = {

	763 2,2,2,2,2,2,2,2, 2,0,0,2,0,0,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,

	764 0,0,0,0,0,0,0,0, 0,0,0,1,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

	765 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

	766 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,1,2,

	767

	768 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,

	769 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,

	770 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,

	771 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,

	772 };

	773

	774 // We can scan bytes using this at about 550 MB/sec 2.8GHz P4

	775 // Slow scan uses this, stopping on NUL ESC SO SI and bad C0

	776 // after Hz and UTF7 are pruned away

	777 // We allow Form Feed, 0x0C, here

	// Lookup table indexed by byte value (0..255). Per the entries below:
	//   0 = HT, LF, FF, CR and 0x20..0x7E (including '+' and '~')
	//   2 = every other C0 control, DEL (0x7F), and all bytes 0x80..0xFF
	// The value 1 is not used in this table.
	778 static const char kTestPrintableAscii[256] = {

	779 2,2,2,2,2,2,2,2, 2,0,0,2,0,0,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,

	780 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

	781 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

	782 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,2,

	783

	784 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,

	785 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,

	786 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,

	787 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,

	788 };

	789

	790 // Used in first-four-byte testing

	// Lookup table indexed by byte value (0..255). Per the entries below:
	//   1 = HT, LF, CR and 0x20..0x7E
	//   0 = everything else (note: Form Feed, 0x0C, is 0 here)
	791 static const char kIsPrintableAscii[256] = {

	792 0,0,0,0,0,0,0,0, 0,1,1,0,0,1,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

	793 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

	794 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

	795 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,0,

	796

	797 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

	798 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

	799 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

	800 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

	801 };

	802

	803

	// Lookup table indexed by byte value (0..255) giving the 6-bit value of a
	// base64 digit, or -1 for any byte that is not a base64 digit:
	//   'A'..'Z' -> 0..25, 'a'..'z' -> 26..51, '0'..'9' -> 52..61,
	//   '+' -> 62, '/' -> 63
	// The padding character '=' (0x3D) maps to -1.
	804 static const signed char kBase64Value[256] = {

	805 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,

	806 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,

	807 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,62,-1,-1,-1,63,

	808 52,53,54,55,56,57,58,59, 60,61,-1,-1,-1,-1,-1,-1,

	809

	810 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,

	811 15,16,17,18,19,20,21,22, 23,24,25,-1,-1,-1,-1,-1,

	812 -1,26,27,28,29,30,31,32, 33,34,35,36,37,38,39,40,

	813 41,42,43,44,45,46,47,48, 49,50,51,-1,-1,-1,-1,-1,

	814

	815 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,

	816 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,

	817 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,

	818 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,

	819

	820 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,

	821 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,

	822 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,

	823 -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,

	824 };

	825

	826

	827 // Subscripted by <state, byte/16>

	828 // Accepts Cx->8x Dx->8x Ex->8x->8x Fx->8x->8x->8x

	829 //

	830 // Fixed Problem: GB has sequences like B2DB B8D6 BDE1 B9B9

	831 // which we can mis-parse as an error byte followed by good UTF-8:

	832 // B2 DBB8 D6BD E1B9B9

	833 // To counteract this, we now require an ASCII7 byte to resync out

	834 // of the error state

	835 // Next problem: good UTF-8 with bad byte

	836 // efbc a012 eea4 bee7 b280 c2b7

	837 // efbca0 12 eea4be e7b280 c2b7

	838 // ^^ bad byte

	839 // fix: change state0 byte 1x to be don't-care

	840 //

	841 // Short UTF-8 ending in ASCII7 byte should resync immediately:

	842 // E0 20 E0 A6 AA should give one error and resync at 2nd E0

	843 //

	// Next-state table for the mini UTF-8 recognizer: one row per current state
	// (0..7), one column per high nibble of the incoming byte (0x0..0xF).
	// State 0 is the start state and state 7 is the error state; columns 8..B are
	// continuation bytes, C..D start 2-byte, E starts 3-byte, F starts 4-byte
	// sequences. Column 1 (bytes 0x10..0x1F) leads to the error state from every
	// state except state 0.
	844 static const char kMiniUTF8State[8][16] = {

	845 {0,0,0,0,0,0,0,0, 7,7,7,7,1,1,2,4,}, // [0] start char (allow cr/lf/ht)

	846 {0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,}, // [1] continue 1 of 2

	847 {0,7,0,0,0,0,0,0, 3,3,3,3,7,7,7,7,}, // [2] continue 1 of 3

	848 {0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,}, // [3] continue 2 of 3

	849 {0,7,0,0,0,0,0,0, 5,5,5,5,7,7,7,7,}, // [4] continue 1 of 4

	850 {0,7,0,0,0,0,0,0, 6,6,6,6,7,7,7,7,}, // [5] continue 2 of 4

	851 {0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,}, // [6] continue 3 of 4

	852 {0,7,0,0,0,0,0,0, 7,7,7,7,7,7,7,7,}, // [7] error, soak up continues,

	853 // ONLY resync after Ascii char

	854 // then restart

	855 };

	856 // Counter to increment: 0-don'tcare 1-error 2-good_2B 3-good_3B 4-good_4B

	// Companion to kMiniUTF8State with the same [state][high nibble] layout.
	// Each entry names the counter to bump for that transition: 0 = none,
	// 1 = error, 2/3/4 = a complete 2-/3-/4-byte sequence (awarded on the final
	// continuation byte, i.e. rows [1], [3] and [6], columns 8..B).
	857 static const char kMiniUTF8Count[8][16] = {

	858 {0,0,0,0,0,0,0,0, 1,1,1,1,0,0,0,0,}, // [0] start char (allow cr/lf/ht)

	859 {1,1,1,1,1,1,1,1, 2,2,2,2,1,1,1,1,}, // [1] continue 1 of 2

	860 {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [2] continue 1 of 3

	861 {1,1,1,1,1,1,1,1, 3,3,3,3,1,1,1,1,}, // [3] continue 2 of 3

	862 {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [4] continue 1 of 4

	863 {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [5] continue 2 of 4

	864 {1,1,1,1,1,1,1,1, 4,4,4,4,1,1,1,1,}, // [6] continue 3 of 4

	865 {0,1,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,}, // [7] error, soak up continues,

	866 // then restart

	867 };

	868

	869 // Subscripted by <state, f(byte1) + g(byte2)>

	870 // where f(x)= E2->4, Cx->8 and C3->12 and 0 otherwise

	871 // and g(x) = (x >> 4) & 3 8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc.

	872 // (no checking for illegal bytes)

	873 // Here are example patterns of CP1252 converted to UTF-8 0/1/2 times. We want

	874 // to detect two, so we can back-convert to one.

	875 // zero one two pattern

	876 // ---- ------ ---------------- -----------------

	877 // 81 C281 C382C281 C3->8x->C2->xx

	878 // 98 CB9C C38BC593 C3->8x->C5->xx

	879 // C3 C383 C383C692 C3->8x->C6->xx

	880 // C8 C388 C383CB86 C3->8x->CB->xx

	881 // 83 C692 C386E28099 C3->8x->E2->xx->8x

	882 // 80 E282AC C3A2E2809AC2AC C3->A2->E2->xx->xx->Cx->xx

	883 // 92 E28099 C3A2E282ACE284A2 C3->A2->E2->xx->xx->E2->xx->xx

	884 //

	885 // We also want to detect bare-byte extra UTF-8 conversions:

	886 // zero one two pattern

	887 // ---- ------ ---------------- -----------------

	888 // C3 C3 C383 C3->8x->C2->xx

	889 // D3 D3 C393 C3->9x->C2->xx->C2->xx

	890 // E3 E3 C3A3 C3->Ax->C2->xx->C2->xx->C2->xx

	891 // F3 F3 C3B3 C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx

	892 //

	893

	894 /**

	895 CP1252 => UTF8 => UTF8UTF8

	896 80 => E282AC => C3A2E2809AC2AC

	897 81 => C281 => C382C281

	898 82 => E2809A => C3A2E282ACC5A1

	899 83 => C692 => C386E28099

	900 84 => E2809E => C3A2E282ACC5BE

	901 85 => E280A6 => C3A2E282ACC2A6

	902 86 => E280A0 => C3A2E282ACC2A0

	903 87 => E280A1 => C3A2E282ACC2A1

	904 88 => CB86 => C38BE280A0

	905 89 => E280B0 => C3A2E282ACC2B0

	906 8A => C5A0 => C385C2A0

	907 8B => E280B9 => C3A2E282ACC2B9

	908 8C => C592 => C385E28099

	909 8D => C28D => C382C28D

	910 8E => C5BD => C385C2BD

	911 8F => C28F => C382C28F

	912 90 => C290 => C382C290

	913 91 => E28098 => C3A2E282ACCB9C

	914 92 => E28099 => C3A2E282ACE284A2

	915 93 => E2809C => C3A2E282ACC593

	916 94 => E2809D => C3A2E282ACC29D

	917 95 => E280A2 => C3A2E282ACC2A2

	918 96 => E28093 => C3A2E282ACE2809C

	919 97 => E28094 => C3A2E282ACE2809D

	920 98 => CB9C => C38BC593

	921 99 => E284A2 => C3A2E2809EC2A2

	922 9A => C5A1 => C385C2A1

	923 9B => E280BA => C3A2E282ACC2BA

	924 9C => C593 => C385E2809C

	925 9D => C29D => C382C29D

	926 9E => C5BE => C385C2BE

	927 9F => C5B8 => C385C2B8

	928 A0 => C2A0 => C382C2A0

	929 A1 => C2A1 => C382C2A1

	930 A2 => C2A2 => C382C2A2

	931 A3 => C2A3 => C382C2A3

	932 A4 => C2A4 => C382C2A4

	933 A5 => C2A5 => C382C2A5

	934 A6 => C2A6 => C382C2A6

	935 A7 => C2A7 => C382C2A7

	936 A8 => C2A8 => C382C2A8

	937 A9 => C2A9 => C382C2A9

	938 AA => C2AA => C382C2AA

	939 AB => C2AB => C382C2AB

	940 AC => C2AC => C382C2AC

	941 AD => C2AD => C382C2AD

	942 AE => C2AE => C382C2AE

	943 AF => C2AF => C382C2AF

	944 B0 => C2B0 => C382C2B0

	945 B1 => C2B1 => C382C2B1

	946 B2 => C2B2 => C382C2B2

	947 B3 => C2B3 => C382C2B3

	948 B4 => C2B4 => C382C2B4

	949 B5 => C2B5 => C382C2B5

	950 B6 => C2B6 => C382C2B6

	951 B7 => C2B7 => C382C2B7

	952 B8 => C2B8 => C382C2B8

	953 B9 => C2B9 => C382C2B9

	954 BA => C2BA => C382C2BA

	955 BB => C2BB => C382C2BB

	956 BC => C2BC => C382C2BC

	957 BD => C2BD => C382C2BD

	958 BE => C2BE => C382C2BE

	959 BF => C2BF => C382C2BF

	960 C0 => C380 => C383E282AC

	961 C1 => C381 => C383C281

	962 C2 => C382 => C383E2809A

	963 C3 => C383 => C383C692

	964 C4 => C384 => C383E2809E

	965 C5 => C385 => C383E280A6

	966 C6 => C386 => C383E280A0

	967 C7 => C387 => C383E280A1

	968 C8 => C388 => C383CB86

	969 C9 => C389 => C383E280B0

	970 CA => C38A => C383C5A0

	971 CB => C38B => C383E280B9

	972 CC => C38C => C383C592

	973 CD => C38D => C383C28D

	974 CE => C38E => C383C5BD

	975 CF => C38F => C383C28F

	976 D0 => C390 => C383C290

	977 D1 => C391 => C383E28098

	978 D2 => C392 => C383E28099

	979 D3 => C393 => C383E2809C

	980 D4 => C394 => C383E2809D

	981 D5 => C395 => C383E280A2

	982 D6 => C396 => C383E28093

	983 D7 => C397 => C383E28094

	984 D8 => C398 => C383CB9C

	985 D9 => C399 => C383E284A2

	986 DA => C39A => C383C5A1

	987 DB => C39B => C383E280BA

	988 DC => C39C => C383C593

	989 DD => C39D => C383C29D

	990 DE => C39E => C383C5BE

	991 DF => C39F => C383C5B8

	992 E0 => C3A0 => C383C2A0

	993 E1 => C3A1 => C383C2A1

	994 E2 => C3A2 => C383C2A2

	995 E3 => C3A3 => C383C2A3

	996 E4 => C3A4 => C383C2A4

	997 E5 => C3A5 => C383C2A5

	998 E6 => C3A6 => C383C2A6

	999 E7 => C3A7 => C383C2A7

	1000 E8 => C3A8 => C383C2A8

	1001 E9 => C3A9 => C383C2A9

	1002 EA => C3AA => C383C2AA

	1003 EB => C3AB => C383C2AB

	1004 EC => C3AC => C383C2AC

	1005 ED => C3AD => C383C2AD

	1006 EE => C3AE => C383C2AE

	1007 EF => C3AF => C383C2AF

	1008 F0 => C3B0 => C383C2B0

	1009 F1 => C3B1 => C383C2B1

	1010 F2 => C3B2 => C383C2B2

	1011 F3 => C3B3 => C383C2B3

	1012 F4 => C3B4 => C383C2B4

	1013 F5 => C3B5 => C383C2B5

	1014 F6 => C3B6 => C383C2B6

	1015 F7 => C3B7 => C383C2B7

	1016 F8 => C3B8 => C383C2B8

	1017 F9 => C3B9 => C383C2B9

	1018 FA => C3BA => C383C2BA

	1019 FB => C3BB => C383C2BB

	1020 FC => C3BC => C383C2BC

	1021 FD => C3BD => C383C2BD

	1022 FE => C3BE => C383C2BE

	1023 FF => C3BF => C383C2BF

	1024 **/

	1025

	1026 // Subscripted by <state, f(byte1) + g(byte2)>

	1027 // where f(x)= E2->4, C2/5/6/B->8 and C3->12 and 0 otherwise

	1028 // and g(x) = (x >> 4) & 3 8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc.

	1029

	1030 // 81 C281 C382C281 C3->8x->C2->xx

	1031 // 98 CB9C C38BC593 C3->8x->C5->xx

	1032 // C3 C383 C383C692 C3->8x->C6->xx

	1033 // C8 C388 C383CB86 C3->8x->CB->xx

	1034 // [0] [2] [0]

	1035 // 83 C692 C386E28099 C3->8x->E2->xx->xx

	1036 // odd_byte=0 [0] [2] [0+] odd_byte flipped

	1037 // odd_byte=1 [0+] [2] [0] [0] odd_byte unflipped

	1038 // 80 E282AC C3A2E2809AC2AC C3->A2->E2->xx->xx->Cx->xx

	1039 // odd_byte=0 [0] [3] [4] [0+]

	1040 // odd_byte=1 [0+] [3] [4] [4] [0]

	1041 // 92 E28099 C3A2E282ACE284A2 C3->A2->E2->xx->xx->E2->xx->xx

	1042 // odd_byte=0 [0] [3] [4] [0] [0]

	1043 // odd_byte=1 [0+] [3] [4] [4] [0+]

	1044 //

	1045 // When an E2xxxx sequence is encountered, we absorb the two bytes E2xx and flip

	1046 // the odd_byte state. If that goes from 0 to 1, the next pair is offset up

	1047 // by one byte, picking up the two bytes just after E2xxxx. If odd_byte goes

	1048 // from 1 to 0, the next two bytes picked up are the two bytes xxxx of E2xxxx.

	1049 // These are absorbed with no error in state 0 or state 4

	1050 //

	1051 // C3 C3 C383 C3->8x->C2->xx

	1052 // D3 D3 C393 C3->9x->C2->xx->C2->xx

	1053 // E3 E3 C3A3 C3->Ax->C2->xx->C2->xx->C2->xx

	1054 // F3 F3 C3B3 C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx

	1055 // Counter3 for Fx Ex sequences is incremented at last C2

	1056

	// Next-state table for the doubly-converted-UTF-8 recognizer: one row per
	// current state (0..7), one column per byte-pair subscript (0..15) as
	// produced by UTF88Sub(). Columns 0..3 are "other" pairs, 4..7 are E2xx,
	// 8..11 are C2/C5/C6/CBxx and 12..15 are C3xx; within each group the column
	// is selected by bits 5..4 of the second byte (8x, 9x, Ax, Bx).
	1057 static const char kMiniUTF8UTF8State[8][16] = {

	1058 // xxxx E2xx CXxx C3xx

	1059 // 8 9 a b 8 9 a b 8 9 a b

	1060 {0,0,0,0,1,1,1,1, 1,1,1,1,2,2,3,5,}, // [0] looking for C38x/C3Ax/2020/8x 8x, or err

	1061 {0,0,0,0,1,1,1,1, 1,1,1,1,2,2,3,5,}, // [1] error, back to looking

	1062 {1,1,1,1,0,0,0,0, 0,0,0,0,1,1,1,1,}, // [2] C38x looking for CXxx/E2xxxx

	1063 // + + + + // E2xxxx flips odd_byte

	1064 {1,1,1,1,4,4,4,4, 7,7,7,7,1,1,1,1,}, // [3] C3Ax looking for E2xx or C2xx C2xx

	1065 // + + + + // E2xxxx flips odd_byte

	1066 {4,4,4,4,0,0,0,0, 0,0,0,0,1,1,1,1,}, // [4] C3AxE2xx-- looking for C2xx/E2xxxx

	1067 // + + + + // E2xxxx flips odd_byte

	1068 {1,1,1,1,1,1,1,1, 6,6,6,6,1,1,1,1,}, // [5] C3Bx -- looking for C2xxC2xxC2xx

	1069 {1,1,1,1,1,1,1,1, 7,7,7,7,1,1,1,1,}, // [6] C3Bx -- looking for C2xxC2xx

	1070 {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [7] C3Bx -- looking for C2xx

	1071 };

	1072 // Counter to increment: 0-don'tcare 1-error 2-good_2B 3-good_3B 4-good_4B

	// Companion to kMiniUTF8UTF8State with the same [state][pair subscript]
	// layout. Each entry names the counter to bump for that transition:
	// 0 = none, 1 = error, 2/3/4 = a recognized doubly-converted sequence.
	1073 static const char kMiniUTF8UTF8Count[8][16] = {

	1074 // xxxx E2xx C2Xx C3xx

	1075 // 8 9 a b 8 9 a b 8 9 a b

	1076 {0,0,0,0,1,1,1,1, 1,1,1,1,0,0,0,0,}, // [0] looking for C38x/C3Ax/2020/8x 8x, or err

	1077 {0,0,0,0,1,1,1,1, 1,1,1,1,0,0,0,0,}, // [1] error, back to looking

	1078 {1,1,1,1,3,3,3,3, 2,2,2,2,1,1,1,1,}, // [2] C38x looking for CXxx/E2xxxx

	1079 // + + + + // E2xxxx flips odd_byte

	1080 {1,1,1,1,0,0,0,0, 0,0,0,0,1,1,1,1,}, // [3] C3Ax looking for E2xx

	1081 // + + + + // E2xxxx flips odd_byte

	1082 {1,1,1,1,4,4,4,4, 4,4,4,4,1,1,1,1,}, // [4] C3AxE2xx-- looking for C2xx/E2xxxx

	1083 // + + + + // E2xxxx flips odd_byte

	1084 {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [5] C3Bx -- looking for C2xxC2xxC2xx

	1085 {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [6] C3Bx -- looking for C2xxC2xx

	1086 {1,1,1,1,1,1,1,1, 3,3,3,3,1,1,1,1,}, // [7] C3Bx -- looking for C2xx

	1087 };

	1088

	// Companion to kMiniUTF8UTF8State with the same [state][pair subscript]
	// layout. Entries are 1 only in the E2xx columns of states [2], [3] and [4];
	// per the row comments these are the transitions on which an E2xxxx sequence
	// flips the odd_byte phase. All other entries are 0.
	1089 static const char kMiniUTF8UTF8Odd[8][16] = {

	1090 // xxxx E2xx C2Xx C3xx

	1091 // 8 9 a b 8 9 a b 8 9 a b

	1092 {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [0] looking for C38x/C3Ax/2020/8x 8x, or err

	1093 {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [1] error, back to looking

	1094 {0,0,0,0,1,1,1,1, 0,0,0,0,0,0,0,0,}, // [2] C38x looking for CXxx/E2xxxx

	1095 // + + + + // E2xxxx flips odd_byte

	1096 {0,0,0,0,1,1,1,1, 0,0,0,0,0,0,0,0,}, // [3] C3Ax looking for E2xx

	1097 // + + + + // E2xxxx flips odd_byte

	1098 {0,0,0,0,1,1,1,1, 0,0,0,0,0,0,0,0,}, // [4] C3AxE2xx-- looking for C2xx/E2xxxx

	1099 // + + + + // E2xxxx flips odd_byte

	1100 {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [5] C3Bx -- looking for C2xxC2xxC2xx

	1101 {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [6] C3Bx -- looking for C2xxC2xx

	1102 {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [7] C3Bx -- looking for C2xx

	1103 };

	1104

	// Turn a pair of bytes into the column subscript (0..15) for the UTF8UTF8
	// tables above. The result is f(s0) + g(s1), where
	//   f(s0) = 12 for C3; 8 for C2, C5, C6 or CB; 4 for E2; 0 for anything else
	//   g(s1) = bits 5..4 of s1 (8x->0, 9x->1, Ax->2, Bx->3)
	// No checking is done for illegal bytes.
	int UTF88Sub(char s0, char s1) {
	  // Work on unsigned byte values: plain char may be signed, and right-shifting
	  // a negative value is implementation-defined before C++20.
	  const unsigned int lead = static_cast<unsigned char>(s0);
	  const unsigned int trail = static_cast<unsigned char>(s1);

	  int sub = static_cast<int>((trail >> 4) & 0x03);
	  switch (lead) {
	    case 0xc3:
	      sub += 12;
	      break;
	    case 0xc2:
	    case 0xc5:
	    case 0xc6:
	    case 0xcb:
	      sub += 8;
	      break;
	    case 0xe2:
	      sub += 4;
	      break;
	    default:
	      break;  // every other lead byte, including the remaining Cx, adds 0
	  }
	  return sub;
	}

	1120

	1121

	1122

	1123

	1124

	1125 // Default probability for an encoding rankedencoding

	1126 // Based on a scan of 55M web pages

	1127 // These values are 255 - log base 2**1/10 (occurrences / total)

	1128 // Large values are most likely. This is the reverse of some Google code

	1129 // 255 = 1.0, 245 = 1/2, 235 = 1/4, 15 = 1/2**24, 0 = 0 (< 1/50M)

	1130 //

	1131 // TODO change this to be per encoding, not permuted

	1132 //

	1133

	1134

	1135 // Support function for unit test program

	1136 // Return ranked encoding corresponding to enc

	1137 // (also exported to compact_enc_det_text.cc)

	1138 int CompactEncDet::BackmapEncodingToRankedEncoding(Encoding enc) {

	1139 for (int i = 0; i < NUM_RANKEDENCODING; ++i) {

	1140 if (kMapToEncoding[i] == enc) {

	1141 return i;

	1142 }

	1143 }

	1144 return -1;

	1145 }

	1146

	1147

	1148 string DecodeActive(uint32 active) {

	1149 string temp("");

	1150 if (active & kBinaryActive) {

	1151 temp.append("Binary ");

	1152 }

	1153 if (active & kUTF1632Active) {

	1154 temp.append("UTF1632 ");

	1155 }

	1156 if (active & kUTF8UTF8Active) {

	1157 temp.append("UTF8UTF8 ");

	1158 }

	1159 if (active & kUTF8Active) {

	1160 temp.append("UTF8 ");

	1161 }

	1162 if (active & kIso2022Active) {

	1163 temp.append("Iso2022 ");

	1164 }

	1165 if (active & kHzActive) {

	1166 temp.append("Hz ");

	1167 }

	1168 if (active & kUTF7Active) {

	1169 temp.append("UTF7A ");

	1170 }

	1171 if (active & kSevenBitActive) {

	1172 temp.append("SevenBit ");

	1173 }

	1174 if (active & kIsIndicCode) {

	1175 temp.append("Indic ");

	1176 }

	1177 if (active & kHighAlphaCode) {

	1178 temp.append("HighAlpha ");

	1179 }

	1180 if (active & kHighAccentCode) {

	1181 temp.append("HighAccent ");

	1182 }

	1183 if (active & kEUCJPActive) {

	1184 temp.append("EUCJP ");

	1185 }

	1186 return temp;

	1187 }

	1188

	1189 static inline bool SevenBitEncoding(int enc) {

	1190 return ((kSpecialMask[enc] & kSevenBitActive) != 0);

	1191 }

	1192 static inline bool TwoByteEncoding(int enc) {

	1193 return ((kSpecialMask[enc] & kTwobyteCode) != 0);

	1194 }

	1195 static inline bool IndicEncoding(int enc) {

	1196 return ((kSpecialMask[enc] & kIsIndicCode) != 0);

	1197 }

	1198 static inline bool HighAlphaEncoding(int enc) {

	1199 return ((kSpecialMask[enc] & kHighAlphaCode) != 0);

	1200 }

	1201 static inline bool HighAccentEncoding(int enc) {

	1202 return ((kSpecialMask[enc] & kHighAccentCode) != 0);

	1203 }

	1204

	1205

	1206 static inline bool AnyActive(DetectEncodingState* destatep) {

	1207 return (destatep->active_special != 0);

	1208 }

	1209 static inline bool SevenBitActive(DetectEncodingState* destatep) {

	1210 return (destatep->active_special & kSevenBitActive) != 0;

	1211 }

	1212

	1213 static inline bool UTF7Active(DetectEncodingState* destatep) {

	1214 return (destatep->active_special & kUTF7Active) != 0;

	1215 }

	1216

	1217 static inline bool HzActive(DetectEncodingState* destatep) {

	1218 return (destatep->active_special & kHzActive) != 0;

	1219 }

	1220 static inline bool Iso2022Active(DetectEncodingState* destatep) {

	1221 return (destatep->active_special & kIso2022Active) != 0;

	1222 }

	1223 static inline bool UTF8Active(DetectEncodingState* destatep) {

	1224 return (destatep->active_special & kUTF8Active) != 0;

	1225 }

	1226 static inline bool UTF8UTF8Active(DetectEncodingState* destatep) {

	1227 return (destatep->active_special & kUTF8UTF8Active) != 0;

	1228 }

	1229 static inline bool UTF1632Active(DetectEncodingState* destatep) {

	1230 return (destatep->active_special & kUTF1632Active) != 0;

	1231 }

	1232 static inline bool BinaryActive(DetectEncodingState* destatep) {

	1233 return (destatep->active_special & kBinaryActive) != 0;

	1234 }

	1235 static inline bool UTF7OrHzActive(DetectEncodingState* destatep) {

	1236 return (destatep->active_special & (kHzActive + kUTF7Active)) != 0;

	1237 }

	1238 static inline bool EUCJPActive(DetectEncodingState* destatep) {

	1239 return ((destatep->active_special & kEUCJPActive) != 0);

	1240 }

	1241 static inline bool OtherActive(DetectEncodingState* destatep) {

	1242 return (destatep->active_special & (kIso2022Active + kBinaryActive +

	1243 kUTF8Active + kUTF8UTF8Active +

	1244 kUTF1632Active + kEUCJPActive)) != 0;

	1245 }

	1246

	1247

	1248 static inline bool CEDFlagRescanning(CEDInternalFlags flags) {

	1249 return (flags & kCEDRescanning) != 0;

	1250 }

	1251

	1252 static inline bool CEDFlagSlowscore(CEDInternalFlags flags) {

	1253 return (flags & kCEDSlowscore) != 0;

	1254 }

	1255

	1256 static inline bool CEDFlagForceTags(CEDInternalFlags flags) {

	1257 return (flags & kCEDForceTags) != 0;

	1258 }

	1259

	1260

	// Returns the larger of a and b (b when they are equal).
	static inline int maxint(int a, int b) {
	  if (a > b) {
	    return a;
	  }
	  return b;
	}

	// Returns the smaller of a and b (b when they are equal).
	static inline int minint(int a, int b) {
	  if (a < b) {
	    return a;
	  }
	  return b;
	}

	1263

	1264 static inline const char* MyRankedEncName(int r_enc) {

	1265 return MyEncodingName(kMapToEncoding[r_enc]);

	1266 }

	1267

	1268

	1269 // Only for debugging. not thread safe

	// State shared by the PsSource* debug-dump helpers below.
	// NOTE(review): kPsSourceWidth is presumably the width callers pass to
	// PsSourceInit; it is not referenced in the code visible here -- confirm.
	1270 static const int kPsSourceWidth = 32;

	// Smallest source offset that still needs dumping; PsSource skips anything
	// below it.
	1271 static int pssourcenext = 0; // debug only. not threadsafe. dump only >= this

	// Number of source bytes per dumped row; set by PsSourceInit.
	1272 static int pssourcewidth = 0; // debug only.

	// Two mark characters per source byte of the current row, plus 8 overscan
	// bytes; allocated by PsSourceInit and released by PsSourceFinish.
	1273 static char* pssource_mark_buffer = NULL;

	// Count of rows dumped so far by PsSource.
	1274 int next_do_src_line;

	// Source offsets of the 16 most recently dumped rows, indexed by
	// (row number & 0x0f).
	1275 int do_src_offset[16];

	1276

	1277

	1278 void PsSourceInit(int len) {

	1279 pssourcenext = 0;

	1280 pssourcewidth = len;

	1281 delete[] pssource_mark_buffer;

	1282 // Allocate 2 Ascii characters per input byte

	1283 pssource_mark_buffer = new char[(pssourcewidth * 2) + 8]; // 8 = overscan

	1284 memset(pssource_mark_buffer, ' ', pssourcewidth * 2);

	1285 memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8);

	1286

	1287 next_do_src_line = 0;

	1288 memset(do_src_offset, 0, sizeof(do_src_offset));

	1289 }

	1290

	1291 void PsSourceFinish() {

	1292 // Print preceding mark buffer

	1293 int j = (pssourcewidth * 2) - 1;

	1294 while ((0 <= j) && (pssource_mark_buffer[j] == ' ')) {--j;} // trim

	1295 pssource_mark_buffer[j + 1] = '\0';

	1296 fprintf(stderr, "( %s) do-src\n", pssource_mark_buffer);

	1297 memset(pssource_mark_buffer, ' ', pssourcewidth * 2);

	1298 memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8);

	1299

	1300 delete[] pssource_mark_buffer;

	1301 pssource_mark_buffer = NULL;

	1302 }

	1303

	1304 // Dump aligned len bytes src... if not already dumped

	1305 void PsSource(const uint8* src, const uint8* isrc, const uint8* srclimit) {

	1306 int offset = src - isrc;

	1307 offset -= (offset % pssourcewidth); // round down to multiple of len bytes

	1308 if (offset < pssourcenext) {

	1309 return;

	1310 }

	1311 pssourcenext = offset + pssourcewidth; // Min offset for next dump

	1312

	1313 // Print preceding mark buffer

	1314 int j = (pssourcewidth * 2) - 1;

	1315 while ((0 <= j) && (pssource_mark_buffer[j] == ' ')) {--j;} // trim

	1316 pssource_mark_buffer[j + 1] = '\0';

	1317 fprintf(stderr, "( %s) do-src\n", pssource_mark_buffer);

	1318 memset(pssource_mark_buffer, ' ', pssourcewidth * 2);

	1319 memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8);

	1320

	1321 // Print source bytes

	1322 const uint8* src_aligned = isrc + offset;

	1323 int length = srclimit - src_aligned;

	1324 length = minint(pssourcewidth, length);

	1325

	1326 fprintf(stderr, "(%05x ", offset);

	1327 for (int i = 0; i < length; ++i) {

	1328 char c = src_aligned[i];

	1329 if (c == '\n') {c = ' ';}

	1330 if (c == '\r') {c = ' ';}

	1331 if (c == '\t') {c = ' ';}

	1332 if (c == '(') {

	1333 fprintf(stderr, "%s", "\\( ");

	1334 } else if (c == ')') {

	1335 fprintf(stderr, "%s", "\\) ");

	1336 } else if (c == '\\') {

	1337 fprintf(stderr, "%s", "\\\\ ");

	1338 } else if ((0x20 <= c) && (c <= 0x7e)) {

	1339 fprintf(stderr, "%c ", c);

	1340 } else {

	1341 fprintf(stderr, "%02x", c);

	1342 }

	1343 }

	1344 fprintf(stderr, ") do-src\n");

	1345 // Remember which source offsets are where, mod 16

	1346 do_src_offset[next_do_src_line & 0x0f] = offset;

	1347 ++next_do_src_line;

	1348 }

	1349

	1350 // Mark bytes in just-previous source bytes

	1351 void PsMark(const uint8* src, int len, const uint8* isrc, int weightshift) {

	1352 int offset = src - isrc;

	1353 offset = (offset % pssourcewidth); // mod len bytes

	1354 char mark = (weightshift == 0) ? '-' : 'x';

	1355

	1356 pssource_mark_buffer[(offset * 2)] = '=';

	1357 pssource_mark_buffer[(offset * 2) + 1] = '=';

	1358 for (int i = 1; i < len; ++i) {

	1359 pssource_mark_buffer[(offset + i) * 2] = mark;

	1360 pssource_mark_buffer[((offset + i) * 2) + 1] = mark;

	1361 }

	1362 }

	1363

	1364

	1365 // Highlight trigram bytes in just-previous source bytes

	1366 // Unfortunately, we have to skip back N lines since source was printed for

	1367 // up to 8 bigrams before we get here. Match on src+1 to handle 0/31 better

	1368 void PsHighlight(const uint8* src, const uint8* isrc, int trigram_val, int n) {

	1369 int offset = (src + 1) - isrc;

	1370 int offset32 = (offset % pssourcewidth); // mod len bytes

	1371 offset -= offset32; // round down to multiple of len bytes

	1372

	1373 for (int i = 1; i <= 16; ++i) {

	1374 if (do_src_offset[(next_do_src_line - i) & 0x0f] == offset) {

	1375 fprintf(stderr, "%d %d %d do-highlight%d\n",

	1376 i, offset32 - 1, trigram_val, n);

	1377 break;

	1378 }

	1379 }

	1380 }

	1381

	1382

	// Reset *destatep to its starting values before a detection run.
	// Source pointers are cleared to NULL (initial_src is filled in by the
	// caller afterwards), counters and mini-state machines are zeroed, all hints
	// are set to UNKNOWN_ENCODING, the candidate list is loaded with every
	// non-Indic ranked encoding, and the probability/hint arrays are zeroed.
	// Fields noted below as "init in ApplyHints" or "not initialized" are
	// deliberately left untouched here.
	1383 void InitDetectEncodingState(DetectEncodingState* destatep) {

	1384 destatep->initial_src = NULL; // Filled in by caller

	1385 destatep->limit_src = NULL;

	1386 destatep->prior_src = NULL;

	1387 destatep->last_pair = NULL;

	1388

	1389 destatep->debug_data = NULL;

	1390 destatep->next_detail_entry = 0;

	1391

	1392 destatep->done = false;

	1393 destatep->reliable = false;

	1394 destatep->hints_derated = false;

	1395 //destatep->declared_enc_1 init in ApplyHints

	1396 //destatep->declared_enc_2 init in ApplyHints

	1397 destatep->prune_count = 0;

	1398

	1399 destatep->trigram_highwater_mark = 0;

	1400 destatep->looking_for_latin_trigrams = false;

	1401 destatep->do_latin_trigrams = false;

	1402

	1403 // Miscellaneous state variables for difficult encodings

	1404 destatep->binary_quadrants_count = 0;

	1405 destatep->binary_8x4_count = 0;

	1406 destatep->binary_quadrants_seen = 0;

	1407 destatep->binary_8x4_seen = 0;

	1408 destatep->utf7_starts = 0;

	1409 destatep->prior_utf7_offset = 0;

	1410 destatep->next_utf8_ministate = 0;

	1411 for (int i = 0; i < 6; i++) {destatep->utf8_minicount[i] = 0;}

	1412 destatep->next_utf8utf8_ministate = 0;

	1413 destatep->utf8utf8_odd_byte = 0;

	1414 for (int i = 0; i < 6; i++) {destatep->utf8utf8_minicount[i] = 0;}

	1415 destatep->next_2022_state = SOSI_NONE;

	1416 destatep->next_hz_state = SOSI_NONE;

	1417 destatep->next_eucjp_oddphase = false;

	1418 for (int i = 0; i < 8; i++) {destatep->byte32_count[i] = 0;}

	// All bits set: every ...Active() test on active_special starts out true.
	1419 destatep->active_special = 0xffffffff;

	1420 destatep->tld_hint = UNKNOWN_ENCODING;

	1421 destatep->http_hint = UNKNOWN_ENCODING;

	1422 destatep->meta_hint = UNKNOWN_ENCODING;

	1423 destatep->bom_hint = UNKNOWN_ENCODING;

	1424 destatep->top_rankedencoding = 0; // ASCII [seven-bit] is the default

	1425 destatep->second_top_rankedencoding = 0; // ASCII [seven-bit] is the default

	1426 destatep->top_prob = -1;

	1427 destatep->second_top_prob = -1;

	1428 // This is wide for first pruning, shrinks for 2nd and later

	1429 destatep->prune_difference = kInititalPruneDifference;

	1430

	1431 destatep->next_prior_bigram = 0;

	1432 destatep->prior_bigram[0] = -1;

	1433 destatep->prior_bigram[1] = -1;

	1434 destatep->prior_bigram[2] = -1;

	1435 destatep->prior_bigram[3] = -1;

	1436

	// NOTE(review): only element 0 of prior_binary is set here -- confirm that
	// the remaining elements need no initial value.
	1437 destatep->prior_binary[0] = -1;

	1438

	1439 // Initialize with all but Indic encodings, which we never detect

	1440 int k = 0;

	1441 for (int rankedencoding = 0;

	1442 rankedencoding < NUM_RANKEDENCODING;

	1443 rankedencoding++) {

	1444 Encoding enc = kMapToEncoding[rankedencoding];

	1445 if (!IndicEncoding(enc)) {

	1446 destatep->rankedencoding_list[k++] = rankedencoding;

	1447 }

	1448 }

	1449 destatep->rankedencoding_list_len = k;

	1450

	1451 // This is where all the action is

	1452 memset(destatep->enc_prob, 0, sizeof(destatep->enc_prob));

	1453

	1454 memset(destatep->hint_prob, 0, sizeof(destatep->hint_prob));

	1455 memset(destatep->hint_weight, 0, sizeof(destatep->hint_weight));

	1456

	1457 destatep->prior_interesting_pair[AsciiPair] = 0;

	1458 destatep->prior_interesting_pair[OtherPair] = 0;

	1459 destatep->next_interesting_pair[AsciiPair] = 0;

	1460 destatep->next_interesting_pair[OtherPair] = 0;

	1461 // interesting_pairs/offsets/weightshifts not initialized; no need

	1462 }

	1463

	1464 // Probability strings are uint8, with zeros removed via simple run-length:

	1465 // (<skip-take byte> <data bytes>)*

	1466 // skip-take:

	1467 // 00 end

	1468 // x0 skip 16 x locations, take 0 data values

	1469 // xy skip x locations, take y data values

	1470 // Multiply all the incoming values by 3 to account for 3x unigram sums

	1471 //

	1472 // {{0x77,0x69,0x6e,0x64,0x31,0x32,0x35,0x35,

	1473 // 0x01,0xc2,0x10,0x41,0xfe,0x71,0xba,0x00,}}, // "wind1255"

	1474 //

	1475 // Weight is 0..100 percent

	1476 //

	1477 // Returns subscript of largest (most probable) value

	1478 //

	1479

	1480

	1481 // {{0x6e,0x6c,0x5f,0x5f, 0x05,0xb2,0xae,0xa0,0x32,0xa1,0x36,0x31,0x42,0x39,0x3b,0x33,0x45,0x11,0x6f,0x00,}}, // "nl__"

	1482 // // ASCII-7-bit=178 Latin1=174 UTF8=160 GB=50 CP1252=161 BIG5=49 Latin2=66 CP1251=57 CP1256=59 CP1250=51 Latin5=69 ISO-8859-15=111 [top ASCII-7-bit]

	1483 int ApplyCompressedProb(const char* iprob, int len,

	1484 int weight, DetectEncodingState* destatep) {

	1485 int* dst = &destatep->enc_prob[0];

	1486 int* dst2 = &destatep->hint_weight[0];

	1487 const uint8* prob = reinterpret_cast<const uint8*>(iprob);

	1488 const uint8* problimit = prob + len;

	1489

	1490 int largest = -1;

	1491 int subscript_of_largest = 0;

	1492

	1493 // Continue with first byte and subsequent ones

	1494 while (prob < problimit) {

	1495 int skiptake = *prob++;

	1496 int skip = (skiptake & 0xf0) >> 4;

	1497 int take = skiptake & 0x0f;

	1498 if (skiptake == 00) {

	1499 break;

	1500 } else if (take == 0) {

	1501 dst += (skip << 4);

	1502 dst2 += (skip << 4);

	1503 } else {

	1504 dst += skip; // Normal case

	1505 dst2 += skip; // Normal case

	1506 for (int i = 0; i < take; i++) {

	1507 int enc = static_cast<int>(dst - &destatep->enc_prob[0]) + i;

	1508 if (largest < prob[i]) {

	1509 largest = prob[i];

	1510 subscript_of_largest = enc;

	1511 }

	1512

	1513 int increment = prob[i] * 3; // The actual increment

	1514

	1515 // Do maximum of previous hints plus this new one

	1516 if (weight > 0) {

	1517 increment = (increment * weight) / 100;

	1518 dst[i] = maxint(dst[i], increment);

	1519 dst2[i] = 1; // New total weight

	1520 }

	1521 }

	1522 prob += take;

	1523 dst += take;

	1524 dst2 += take;

	1525 }

	1526 }

	1527 return subscript_of_largest;

	1528 }

	1529

	1530

	// Returns subscript of largest (most probable) value [for unit test].
	//   iprob - run-length-compressed probability string: a sequence of
	//           (<skip-take byte> <data bytes>) where 00 ends the string, x0 skips
	//           16*x slots and takes no data, and xy skips x slots then takes y
	//           data bytes
	//   len   - number of bytes available at iprob
	// Ties keep the earliest subscript; returns 0 if no data byte is non-zero.
	// A final run that claims more data bytes than remain within len is cut
	// short rather than read past the end of the input.
	int TopCompressedProb(const char* iprob, int len) {
	  const unsigned char* prob = reinterpret_cast<const unsigned char*>(iprob);
	  const unsigned char* const problimit = prob + len;
	  int next_prob_sub = 0;
	  int topprob = 0;
	  int toprankenc = 0;

	  while (prob < problimit) {
	    const int skiptake = *prob++;
	    if (skiptake == 0) {
	      break;  // explicit end marker
	    }
	    const int skip = (skiptake & 0xf0) >> 4;
	    int take = skiptake & 0x0f;
	    if (take == 0) {
	      next_prob_sub += (skip << 4);  // long skip, no data
	      continue;
	    }
	    next_prob_sub += skip;  // Normal case

	    // Never read data bytes beyond the end of the input.
	    const int remaining = static_cast<int>(problimit - prob);
	    if (take > remaining) {
	      take = remaining;
	    }
	    for (int i = 0; i < take; i++) {
	      if (topprob < prob[i]) {
	        topprob = prob[i];
	        toprankenc = next_prob_sub + i;
	      }
	    }
	    prob += take;
	    next_prob_sub += take;
	  }
	  return toprankenc;
	}

	1561

	1562

	1563 // Find subscript of matching key in first 8 bytes of sorted hint array, or -1

	1564 int HintBinaryLookup8(const HintEntry* hintprobs, int hintprobssize,

	1565 const char* norm_key) {

	1566 // Key is always in range [lo..hi)

	1567 int lo = 0;

	1568 int hi = hintprobssize;

	1569 while (lo < hi) {

	1570 int mid = (lo + hi) >> 1;

	1571 int comp = memcmp(&hintprobs[mid].key_prob[0], norm_key, 8);

	1572 if (comp < 0) {

	1573 lo = mid + 1;

	1574 } else if (comp > 0) {

	1575 hi = mid;

	1576 } else {

	1577 return mid;

	1578 }

	1579 }

	1580 return -1;

	1581 }

	1582

	1583 // Find subscript of matching key in first 4 bytes of sorted hint array, or -1

	1584 int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize,

	1585 const char* norm_key) {

	1586 // Key is always in range [lo..hi)

	1587 int lo = 0;

	1588 int hi = hintprobssize;

	1589 while (lo < hi) {

	1590 int mid = (lo + hi) >> 1;

	1591 int comp = memcmp(&hintprobs[mid].key_prob[0], norm_key, 4);

	1592 if (comp < 0) {

	1593 lo = mid + 1;

	1594 } else if (comp > 0) {

	1595 hi = mid;

	1596 } else {

	1597 return mid;

	1598 }

	1599 }

	1600 return -1;

	1601 }

	1602

	1603 static inline void Boost(DetectEncodingState* destatep, int r_enc, int boost) {

	1604 destatep->enc_prob[r_enc] += boost;

	1605 }

	1606

	1607 static inline void Whack(DetectEncodingState* destatep, int r_enc, int whack) {

	1608 destatep->enc_prob[r_enc] -= whack;

	1609 }

	1610

	1611 // Apply initial probability hint based on top level domain name

	1612 // Weight is 0..100 percent

	1613 // Return 1 if name match found

	1614 int ApplyTldHint(const char* url_tld_hint, int weight,

	1615 DetectEncodingState* destatep) {

	1616 if (url_tld_hint[0] == '~') {

	1617 return 0;

	1618 }

	1619 string normalized_tld = MakeChar4(string(url_tld_hint));

	1620 int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,

	1621 normalized_tld.c_str());

	1622 if (n >= 0) {

	1623 // TLD is four bytes, probability table is ~12 bytes

	1624 int best_sub = ApplyCompressedProb(&kTLDHintProbs[n].key_prob[kMaxTldKey],

	1625 kMaxTldVector, weight, destatep);

	1626 // Never boost ASCII7; do CP1252 instead

	1627 if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;}

	1628 destatep->declared_enc_1 = best_sub;

	1629 if (destatep->debug_data != NULL) {

	1630 // Show TLD hint

	1631 SetDetailsEncProb(destatep, 0, best_sub, url_tld_hint);

	1632 }

	1633 return 1;

	1634 }

	1635 return 0;

	1636 }

	1637

	1638 // Apply initial probability hint based on charset= name

	1639 // Weight is 0..100 percent

	1640 // Return 1 if name match found

	1641 int ApplyCharsetHint(const char* charset_hint, int weight,

	1642 DetectEncodingState* destatep) {

	1643 if (charset_hint[0] == '~') {

	1644 return 0;

	1645 }

	1646 string normalized_charset = MakeChar44(string(charset_hint));

	1647 int n = HintBinaryLookup8(kCharsetHintProbs, kCharsetHintProbsSize,

	1648 normalized_charset.c_str());

	1649 if (n >= 0) {

	1650 // Charset is eight bytes, probability table is ~eight bytes

	1651 int best_sub = ApplyCompressedProb(&kCharsetHintProbs[n].key_prob[kMaxCharse tKey],

	1652 kMaxCharsetVector, weight, destatep);

	1653 // Never boost ASCII7; do CP1252 instead

	1654 if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;}

	1655 destatep->declared_enc_1 = best_sub;

	1656

	1657 // If first explicitly declared charset is confusable with Latin1/1252, put

	1658 // both declared forms in declared_enc_*, displacing Latin1/1252.

	1659 // This avoids a bit of Latin1 creep.

	1660 // Also boost the declared encoding and its pair

	1661 // TODO (dsites) This should all be folded into postproc-enc-detect.cc

	1662 if ((destatep->http_hint == UNKNOWN_ENCODING) &&

	1663 (destatep->meta_hint == UNKNOWN_ENCODING)) {

	1664 // This is the first charset=hint

	1665 switch (best_sub) {

	1666 case F_Latin2: // 8859-2 Latin2, east euro

	1667 destatep->declared_enc_2 = F_CP1250;

	1668 Boost(destatep, F_Latin2, kGentleOnePair);

	1669 Boost(destatep, F_CP1250, kGentleOnePair);

	1670 break;

	1671 case F_CP1250:

	1672 destatep->declared_enc_2 = F_Latin2;

	1673 Boost(destatep, F_Latin2, kGentleOnePair);

	1674 Boost(destatep, F_CP1250, kGentleOnePair);

	1675 break;

	1676

	1677 case F_Latin3: // 8859-3 Latin3, south euro, Esperanto

	1678 destatep->declared_enc_2 = F_ASCII_7_bit;

	1679 Boost(destatep, F_Latin3, kGentleOnePair);

	1680 break;

	1681

	1682 case F_Latin4: // 8859-4 Latin4, north euro

	1683 destatep->declared_enc_2 = F_ASCII_7_bit;

	1684 Boost(destatep, F_Latin4, kGentleOnePair);

	1685 break;

	1686

	1687 case F_ISO_8859_5: // 8859-5 Cyrillic

	1688 destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost 1251

	1689 Boost(destatep, F_ISO_8859_5, kGentleOnePair); // (too different)

	1690 break;

	1691 case F_CP1251:

	1692 destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost -5

	1693 Boost(destatep, F_CP1251, kGentleOnePair); // (too different)

	1694 break;

	1695

	1696 case F_Arabic: // 8859-6 Arabic

	1697 destatep->declared_enc_2 = F_CP1256;

	1698 Boost(destatep, F_Arabic, kGentleOnePair);

	1699 Boost(destatep, F_CP1256, kGentleOnePair);

	1700 break;

	1701 case F_CP1256:

	1702 destatep->declared_enc_2 = F_Arabic;

	1703 Boost(destatep, F_Arabic, kGentleOnePair);

	1704 Boost(destatep, F_CP1256, kGentleOnePair);

	1705 break;

	1706

	1707 case F_Greek: // 8859-7 Greek

	1708 destatep->declared_enc_2 = F_CP1253;

	1709 Boost(destatep, F_Greek, kGentleOnePair);

	1710 Boost(destatep, F_CP1253, kGentleOnePair);

	1711 break;

	1712 case F_CP1253:

	1713 destatep->declared_enc_2 = F_Greek;

	1714 Boost(destatep, F_Greek, kGentleOnePair);

	1715 Boost(destatep, F_CP1253, kGentleOnePair);

	1716 break;

	1717

	1718 case F_Hebrew: // 8859-8 Hebrew

	1719 destatep->declared_enc_2 = F_CP1255;

	1720 Boost(destatep, F_Hebrew, kGentleOnePair);

	1721 Boost(destatep, F_CP1255, kGentleOnePair);

	1722 break;

	1723 case F_CP1255:

	1724 destatep->declared_enc_2 = F_Hebrew;

	1725 Boost(destatep, F_Hebrew, kGentleOnePair);

	1726 Boost(destatep, F_CP1255, kGentleOnePair);

	1727 break;

	1728

	1729 case F_Latin5: // 8859-9 Latin5, Turkish

	1730 destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost 1254

	1731 Boost(destatep, F_Latin5, kGentleOnePair); // (too different)

	1732 break;

	1733 case F_CP1254:

	1734 destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost Latin5

	1735 Boost(destatep, F_CP1254, kGentleOnePair); // (too different)

	1736 break;

	1737

	1738 case F_Latin6: // 8859-10 Latin6, Nordic

	1739 destatep->declared_enc_2 = F_ASCII_7_bit;

	1740 Boost(destatep, F_Latin6, kGentleOnePair);

	1741 break;

	1742

	1743 case F_ISO_8859_11: // 8859-11 Thai,

	1744 destatep->declared_enc_2 = F_CP874;

	1745 Boost(destatep, F_ISO_8859_11, kGentleOnePair);

	1746 Boost(destatep, F_CP874, kGentleOnePair);

	1747 break;

	1748 case F_CP874:

	1749 destatep->declared_enc_2 = F_ISO_8859_11;

	1750 Boost(destatep, F_ISO_8859_11, kGentleOnePair);

	1751 Boost(destatep, F_CP874, kGentleOnePair);

	1752 break;

	1753

	1754 case F_ISO_8859_13: // 8859-13 Latin7, Baltic

	1755 destatep->declared_enc_2 = F_CP1257;

	1756 Boost(destatep, F_ISO_8859_13, kGentleOnePair);

	1757 Boost(destatep, F_CP1257, kGentleOnePair);

	1758 break;

	1759 case F_CP1257:

	1760 destatep->declared_enc_2 = F_ISO_8859_13;

	1761 Boost(destatep, F_ISO_8859_13, kGentleOnePair);

	1762 Boost(destatep, F_CP1257, kGentleOnePair);

	1763 break;

	1764

	1765 case F_ISO_8859_15: // 8859-15 Latin9, Latin0, Euro-ized Latin1

	1766 destatep->declared_enc_2 = F_ASCII_7_bit;

	1767 Boost(destatep, F_ISO_8859_15, kGentleOnePair);

	1768 break;

	1769

	1770

	1771 // Greek all-caps is confusable with KOI8x all-lower and Hebrew.

	1772 // This turns some Greek documents into Cyrillic, etc. by mistake.

	1773 // Greek and Hebrew are boosted explicitly above; do KOI8x here.

	1774 // Boosting the declared encodingmakes it harder for the wrong one to

	1775 // creep up.

	1776 case F_KOI8R:

	1777 Boost(destatep, F_KOI8R, kGentleOnePair);

	1778 break;

	1779 case F_KOI8U:

	1780 Boost(destatep, F_KOI8U, kGentleOnePair);

	1781 break;

	1782

	1783 default:

	1784 break;

	1785 }

	1786 }

	1787

	1788 if (destatep->debug_data != NULL) {

	1789 // Show charset hint

	1790 SetDetailsEncProb(destatep, 0, best_sub, charset_hint);

	1791 }

	1792

	1793 //

	1794 // Some fix-ups for the declared encodings

	1795 //

	1796

	1797 // If non-UTF8, non-Latin1/1252 encoding declared, disable UTF8 combos

	1798 // TODO (dsites) This should all be folded into postproc-enc-detect.cc

	1799 if ((best_sub != F_UTF8) &&

	1800 (best_sub != F_Latin1) &&

	1801 (best_sub != F_CP1252)) {

	1802 Whack(destatep, F_UTF8UTF8, kBadPairWhack * 4); // demote

	1803 }

	1804

	1805 // Latin2 and CP1250 differ in the overlap part, such as B1 or B9

	1806 // The initial probabilites for charset=Latin2 explicitly put CP1250

	1807 // down twice as far as normal, and vice versa. This is done in

	1808 // postproc-enc-detect.cc

	1809

	1810 // If charset=user-defined, treat as Binary --

	1811 // we can safely only do low ASCII, might be Indic

	1812 if (normalized_charset.substr(0,4) == "user") {

	1813 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	1814 }

	1815

	1816 return 1;

	1817 }

	1818 return 0;

	1819 }

	1820

	1821 // Apply initial probability hint based on caller-supplied encoding

	1822 // Negative hint whacks ~encoding, non-negative boosts encoding

	1823 //

	1824 // Negative hints are an experiment to see if they might be useful.

	1825 // Not operator used instead of unary minus to allow specifying not-zero

	1826 int ApplyEncodingHint(const int encoding_hint, int weight,

	1827 DetectEncodingState* destatep) {

	1828 Encoding enc_hint = static_cast<Encoding>((encoding_hint < 0) ?

	1829 ~encoding_hint : encoding_hint);

	1830 // Map to the right internal subscript

	1831 int rankedenc_hint = CompactEncDet::BackmapEncodingToRankedEncoding(enc_hint);

	1832

	1833 // I'm not sure how strong this hint should be. Weight 100% = 1 bigram

	1834 int increment = (kBoostOnePair * weight) / 100;

	1835

	1836 if (encoding_hint < 0) {

	1837 destatep->enc_prob[rankedenc_hint] -= increment;

	1838 } else {

	1839 destatep->enc_prob[rankedenc_hint] += increment;

	1840 }

	1841

	1842 if (destatep->debug_data != NULL) {

	1843 // Show encoding hint

	1844 SetDetailsEncProb(destatep, 0, -1, MyEncodingName(enc_hint));

	1845 }

	1846 return 1;

	1847 }

	1848

	1849 // Apply initial probability hint based on user interface language

	1850 // Weight is 0..100 percent

	1851 // Return 1 if name match found

	1852 int ApplyUILangaugeHint(const Language language_hint,

	1853 int weight, DetectEncodingState* destatep) {

	1854 if (language_hint == UNKNOWN_LANGUAGE) {

	1855 return 0;

	1856 }

	1857 string normalized_lang = MakeChar8(LanguageName(language_hint));

	1858 int n = HintBinaryLookup8(kLangHintProbs, kLangHintProbsSize,

	1859 normalized_lang.c_str());

	1860 if (n >= 0) {

	1861 // Language is eight bytes, probability table is ~eight bytes

	1862 int best_sub = ApplyCompressedProb(&kLangHintProbs[n].key_prob[kMaxLangKey],

	1863 kMaxLangVector, weight, destatep);

	1864 // Never boost ASCII7; do CP1252 instead

	1865 if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;}

	1866 destatep->declared_enc_1 = best_sub;

	1867 if (destatep->debug_data != NULL) {

	1868 // Show language hint

	1869 SetDetailsEncProb(destatep, 0, best_sub, normalized_lang.c_str());

	1870 }

	1871 return 1;

	1872 }

	1873 return 0;

	1874 }

	1875

	1876 // Apply initial probability hint based on corpus type (web, email, etc)

	1877 // Weight is 0..100 percent IGNORED

	1878 // Return 1 if name match found

	1879 int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type,

	1880 int weight, DetectEncodingState* destatep) {

	1881

	1882 for (int i = 0; i < NUM_RANKEDENCODING; i++) {

	1883 // Set the default probability

	1884 destatep->enc_prob[i] = kDefaultProb[i] * 3;

	1885 // Deliberately set 2022 seven-bit encodings to zero,

	1886 // so we can look for actual use

	1887 // TODO (dsites) This should all be folded into postproc-enc-detect.cc

	1888 if (SevenBitEncoding(kMapToEncoding[i])) {

	1889 destatep->enc_prob[i] = 0;

	1890 }

	1891 }

	1892

	1893 // A little corpus distinction

	1894 switch (corpus_type) {

	1895 case CompactEncDet::WEB_CORPUS:

	1896 case CompactEncDet::XML_CORPUS:

	1897 // Allow double-converted UTF-8 to start nearly equal to normal UTF-8

	1898 destatep->enc_prob[F_UTF8UTF8] =

	1899 destatep->enc_prob[F_UTF8] - kSmallInitDiff;

	1900 break;

	1901 case CompactEncDet::QUERY_CORPUS:

	1902 case CompactEncDet::EMAIL_CORPUS:

	1903 default:

	1904 break;

	1905 }

	1906

	1907 if (FLAGS_demo_nodefault) {

	1908 // Demo, make initial probs all zero

	1909 for (int i = 0; i < NUM_RANKEDENCODING; i++) {

	1910 destatep->enc_prob[i] = 0;

	1911 }

	1912 }

	1913

	1914 if (destatep->debug_data != NULL) {

	1915 // Show default hint

	1916 SetDetailsEncProb(destatep, 0, -1, "Default");

	1917 }

	1918 return 1;

	1919 }

	1920

	1921

	1922

	1923 // Do reverse search for c in [str..str+len)

	1924 // Note: initial pointer is to FRONT of string, not back

	1925 const char* MyMemrchr(const char* str, char c, size_t len) {

	1926 const char* ret = str + len;

	1927 while (str <= --ret) {

	1928 if (*ret == c) {return ret;}

	1929 }

	1930 return NULL;

	1931 }

	1932

	1933

// Minimum real URL is 11 bytes: "http://a.bc" -- shorter is assumed to be TLD
// Now that we are no longer trying to do Indic font-based encodings, we
// don't need the full URL and can go back to simple TLD. This test remains for
// backwards compatibility with any caller using full URL.
static const int kMinURLLength = 11;

	1939

	1940 // Extract TLD from a full URL or just a TLD

	1941 // Return hostname and length if a full URL

	1942 void ExtractTLD(const char* url_hint, char* tld_hint, int tld_hint_len,

	1943 const char** ret_host_start, int* ret_host_len) {

	1944 // url_hint can either be a full URL (preferred) or just top-level domain name

	1945 // Extract the TLD from a full URL and use it for

	1946 // a normal TLD hint

	1947

	1948 strncpy(tld_hint, "~", tld_hint_len);

	1949 tld_hint[tld_hint_len - 1] = '\0';

	1950 *ret_host_start = NULL;

	1951 *ret_host_len = 0;

	1952

	1953 int url_len = (url_hint != NULL) ? strlen(url_hint) : 0;

	1954 if (url_len == 0) {

	1955 // Empty TLD

	1956 return;

	1957 }

	1958

	1959 // Minimum real URL is 11 bytes: "http://a.bc" -- shorter is assumed to be TLD

	1960 if (kMinURLLength <= url_len) {

	1961 // See if it really is a URL

	1962 const char* first_slash = strchr(url_hint, '/');

	1963 if ((first_slash != NULL) && (first_slash != url_hint) &&

	1964 (first_slash[-1] == ':') && (first_slash[1] == '/') &&

	1965 (memrchr(url_hint, '.', first_slash - url_hint) == NULL)) {

	1966 // We found :// and no dot in front of it, so declare a real URL

	1967

	1968 const char* hostname_start = first_slash + 2;

	1969 const char* hostname_end = strchr(hostname_start, '/');

	1970 if (hostname_end == NULL) {

	1971 // No slash; end is first byte off end of the URL string

	1972 hostname_end = url_hint + url_len;

	1973 }

	1974 size_t hostname_len = hostname_end - hostname_start;

	1975 const char* port_start =

	1976 (const char*)memchr(hostname_start, ':', hostname_len);

	1977 if (port_start != NULL) {

	1978 // Port; shorten hostname

	1979 hostname_end = port_start;

	1980 hostname_len = hostname_end - hostname_start;

	1981 }

	1982

	1983 const char* tld_start = MyMemrchr(hostname_start, '.', hostname_len);

	1984 if (tld_start != NULL) {

	1985 // Remember the TLD we just found

	1986 int tld_len = hostname_start + hostname_len - tld_start - 1;

	1987 if (tld_len > (tld_hint_len - 1)) {

	1988 tld_len = tld_hint_len - 1;

	1989 }

	1990 memcpy(tld_hint, tld_start + 1, tld_len);

	1991 tld_hint[tld_len] = '\0';

	1992 }

	1993 *ret_host_start = hostname_start;

	1994 *ret_host_len = hostname_len;

	1995 return;

	1996 }

	1997 } else {

	1998 strncpy(tld_hint, url_hint, tld_hint_len);

	1999 tld_hint[tld_hint_len - 1] = '\0';

	2000 }

	2001 }

	2002

	2003 // Apply hints, if any, to probabilities

	2004 // NOTE: Encoding probabilites are all zero at this point

	2005 void ApplyHints(const char* url_hint,

	2006 const char* http_charset_hint,

	2007 const char* meta_charset_hint,

	2008 const int encoding_hint,

	2009 const Language language_hint,

	2010 const CompactEncDet::TextCorpusType corpus_type,

	2011 DetectEncodingState* destatep) {

	2012 int hint_count = 0;

	2013 // url_hint can either be a full URL (preferred) or just top-level domain name

	2014 // Extract the TLD from a full URL and use it for

	2015 // a normal TLD hint

	2016

	2017 char tld_hint[16];

	2018 const char* hostname_start = NULL;

	2019 int hostname_len = 0;

	2020 ExtractTLD(url_hint, tld_hint, sizeof(tld_hint),

	2021 &hostname_start, &hostname_len);

	2022

	2023

	2024 // Initial hints give slight boost to Ascii-7-bit and code page 1252

	2025 // ApplyXxx routines copy enc_1 to enc_2 then update declared_enc_1

	2026 // This gives a boost to 1252 if one of HTTP/META is specified,

	2027 // but this could be the wrong thing to do if Latin2/3/4/etc. is specified

	2028 destatep->declared_enc_1 = F_CP1252;

	2029 destatep->declared_enc_2 = F_ASCII_7_bit;

	2030

	2031 // Applying various hints takes max of new hint and any old hint.

	2032 // This does better on multiple hints that a weighted average

	2033

	2034 // Weight is 0..100 percent

	2035 if ((http_charset_hint != NULL) && (http_charset_hint[0] != '~')) {

	2036 destatep->declared_enc_2 = destatep->declared_enc_1;

	2037 hint_count += ApplyCharsetHint(http_charset_hint, 100, destatep);

	2038 destatep->http_hint = kMapToEncoding[destatep->declared_enc_1];

	2039 if ((destatep->declared_enc_1 == F_CP1252) \|\|

	2040 (destatep->declared_enc_1 == F_Latin1)) {

	2041 destatep->looking_for_latin_trigrams = true;

	2042 }

	2043 }

	2044 if ((meta_charset_hint != NULL) && (meta_charset_hint[0] != '~')) {

	2045 destatep->declared_enc_2 = destatep->declared_enc_1;

	2046 hint_count += ApplyCharsetHint(meta_charset_hint, 100, destatep);

	2047 destatep->meta_hint = kMapToEncoding[destatep->declared_enc_1];

	2048 if ((destatep->declared_enc_1 == F_CP1252) \|\|

	2049 (destatep->declared_enc_1 == F_Latin1)) {

	2050 destatep->looking_for_latin_trigrams = true;

	2051 }

	2052 }

	2053 if (encoding_hint != UNKNOWN_ENCODING) {

	2054 destatep->declared_enc_2 = destatep->declared_enc_1;

	2055 hint_count += ApplyEncodingHint(encoding_hint, 50, destatep);

	2056 }

	2057 if (language_hint != UNKNOWN_LANGUAGE) {

	2058 destatep->declared_enc_2 = destatep->declared_enc_1;

	2059 hint_count += ApplyUILangaugeHint(language_hint, 50, destatep);

	2060 }

	2061 // Use top level domain if not .com and <=1 other hint was available

	2062 if (url_hint != NULL) {

	2063 destatep->tld_hint = CompactEncDet::TopEncodingOfTLDHint(tld_hint);

	2064 if (hint_count == 0) {

	2065 // Apply with weight 100%

	2066 destatep->declared_enc_2 = destatep->declared_enc_1;

	2067 hint_count += ApplyTldHint(tld_hint, 100, destatep);

	2068 if ((destatep->declared_enc_1 == F_CP1252) \|\|

	2069 (destatep->declared_enc_1 == F_Latin1)) {

	2070 destatep->looking_for_latin_trigrams = true;

	2071 }

	2072 if (strcmp("hu", tld_hint) == 0) {

	2073 // Hungarian is particularly difficult to separate Latin2 from Latin1,

	2074 // so always look for trigram scanning if bare TLD=hu hint

	2075 destatep->looking_for_latin_trigrams = true;

	2076 }

	2077 // Treat .com as no TLD hint at all

	2078 } else if ((hint_count == 1) && (strcmp("com", tld_hint) != 0)) {

	2079 // Either shift weighting or consider doing no TLD here -- seems to

	2080 // distract from correct charset= hints. Or perhaps apply only if

	2081 // charset = Latin1/1252...

	2082 // Apply with weight 50%

	2083 destatep->declared_enc_2 = destatep->declared_enc_1;

	2084 hint_count += ApplyTldHint(tld_hint, 50, destatep);

	2085 if ((destatep->declared_enc_1 == F_CP1252) \|\|

	2086 (destatep->declared_enc_1 == F_Latin1)) {

	2087 destatep->looking_for_latin_trigrams = true; // These need trigrams

	2088 }

	2089 }

	2090 // Else ignore TLD hint entirely

	2091 }

	2092

	2093 // Use all-web default distribution if not even a TLD hint

	2094 if (hint_count == 0) {

	2095 destatep->looking_for_latin_trigrams = true; // Default needs trigrams

	2096 destatep->declared_enc_2 = destatep->declared_enc_1;

	2097 hint_count += ApplyDefaultHint(corpus_type, 100, destatep);

	2098 }

	2099

	2100

	2101 // ISO-Microsoft Pairs

	2102 // F_Latin1, F_CP1252,

	2103 // F_Latin2, F_CP1250, NOT really strict subset/superset pairs

	2104 // F_Latin3,

	2105 // F_Latin4,

	2106 // F_ISO_8859_5, F_CP1251,

	2107 // F_Arabic, F_CP1256, NOT

	2108 // F_Greek, F_CP1253, NOT really pairs

	2109 // (or upgrade incvt to make Greek use CP)

	2110 // F_Hebrew, F_CP1255, NOT really pairs

	2111 // F_Latin5, F_CP1254,

	2112 // F_Latin6,

	2113 // F_ISO_8859_11,

	2114 // F_ISO_8859_13, F_CP1257,

	2115 // F_ISO_8859_15,

	2116 // ISO-Microsoft Pairs

	2117

	2118 // Get important families started together

	2119 // // This should fall out of the initializatoin vectors for charset,

	2120 // but we need to get rid of families alltogetrher

	2121 //

	2122 // TODO make this more graceful

	2123

	2124 // Add small bias for subsets

	2125

	2126 // Subtract small bias for supersets

	2127 destatep->enc_prob[F_CP932] = destatep->enc_prob[F_SJS] - kSmallInitDiff;

	2128

	2129 destatep->enc_prob[F_GBK] = destatep->enc_prob[F_GB] - kSmallInitDiff;

	2130 destatep->enc_prob[F_GB18030] = destatep->enc_prob[F_GB] - kSmallInitDiff;

	2131

	2132 destatep->enc_prob[F_BIG5_CP950] = destatep->enc_prob[F_BIG5] -

	2133 kSmallInitDiff;

	2134 destatep->enc_prob[F_BIG5_HKSCS] = destatep->enc_prob[F_BIG5] -

	2135 kSmallInitDiff;

	2136

	2137 // Deliberate over-bias Ascii7 and underbias Binary [unneeded]

	2138 // destatep->enc_prob[F_ASCII_7_bit] = destatep->enc_prob[F_ASCII_7_bit] + kSm allInitDiff;

	2139 // destatep->enc_prob[F_BINARY] = destatep->enc_prob[F_BINARY] - (kBoostInitia l / 2);

	2140

	2141 if (destatep->debug_data != NULL) {

	2142 // Show state at end of hints

	2143 SetDetailsEncProb(destatep, 0, -1, "Endhints");

	2144 if(FLAGS_enc_detect_detail2) {

	2145 // Add a line showing the watched encoding(s)

	2146 if (watch1_rankedenc >= 0) {

	2147 SetDetailsEncProb(destatep, 0,

	2148 watch1_rankedenc, FLAGS_enc_detect_watch1.c_str());

	2149 }

	2150 if (watch2_rankedenc >= 0) {

	2151 SetDetailsEncProb(destatep, 0,

	2152 watch2_rankedenc, FLAGS_enc_detect_watch2.c_str());

	2153 }

	2154 } // End detail2

	2155 }

	2156

	2157 // If duplicate hints, set second one to ASCII_7BIT to prevent double-boost

	2158 if (destatep->declared_enc_1 == destatep->declared_enc_2) {

	2159 destatep->declared_enc_2 = F_ASCII_7_bit;

	2160 }

	2161

	2162 if (FLAGS_force127) {

	2163 destatep->do_latin_trigrams = true;

	2164 if (FLAGS_enc_detect_source) {

	2165 PsHighlight(0, destatep->initial_src, 0, 2);

	2166 }

	2167 }

	2168

	2169

	2170 if (FLAGS_counts && destatep->looking_for_latin_trigrams) {++looking_used;}

	2171 if (FLAGS_counts && destatep->do_latin_trigrams) {++doing_used;}

	2172

	2173 //

	2174 // At this point, destatep->enc_prob[] is an initial probability vector based

	2175 // on the given hints/default. In general, it spreads out least-likely

	2176 // encodings to be about 2**-25 below the most-likely encoding.

	2177 // For input text with lots of bigrams, an unlikely encoding can rise to

	2178 // the top at a rate of about 26 per bigram, and more commonly 22 per

	2179 // bigram. So more than 4 bigrams and commonly more than 12 are

	2180 // needed to overcome the initial hints when the least-likely encoding

	2181 // is in fact the correct answer. So if the entire text has very few bigrams

	2182 // (as a two-word query might), it can be impossible for the correct

	2183 // encoding to win.

	2184 //

	2185 // To compensate for this, we take the initial hint vector and effectively

	2186 // apply it at the rate of 1/16 every bigram for the first 16 bigrams. The

	2187 // actual mechanism is done just before the last prune.

	2188 //

	2189

	2190 // Remember Initial hint probabilities

	2191 memcpy(destatep->hint_prob, destatep->enc_prob, sizeof(destatep->enc_prob));

	2192 }

	2193

	2194 // Look for specific high-value patterns in the first 4 bytes

	2195 // Byte order marks (BOM)

	2196 // EFBBBF UTF-8

	2197 // FEFF UTF-16 BE

	2198 // FFFE UTF-16 LE

	2199 // FFFE0000 UTF-32 BE

	2200 // 0000FEFF UTF-32 LE

	2201 //

	2202 // Likely UTF-x of seven-bit ASCII

	2203 // 00xx UTF-16 BE xx printable ASCII

	2204 // xx00 UTF-16 LE

	2205 // 000000xx UTF-32 BE

	2206 // xx000000 UTF-32 LE

	2207 //

	2208 void InitialBytesBoost(const uint8* src,

	2209 int text_length,

	2210 DetectEncodingState* destatep) {

	2211 if (text_length < 4) {return;}

	2212

	2213 char32 pair01 = (src[0] << 8) \| src[1];

	2214 char32 pair23 = (src[2] << 8) \| src[3];

	2215 char32 quad0123 = (pair01 << 16) \| pair23;

	2216

	2217 bool utf_16_indication = false;

	2218 bool utf_32_indication = false;

	2219 int best_enc = -1;

	2220

	2221 // Byte order marks

	2222 // UTF-8

	2223 if ((quad0123 & 0xffffff00) == 0xEFBBBF00) {

	2224 destatep->bom_hint = UTF8;

	2225 Boost(destatep, F_UTF8, kBoostInitial * 2);

	2226 Boost(destatep, F_UTF8UTF8, kBoostInitial * 2);

	2227 best_enc = F_UTF8;

	2228 // UTF-32 (test before UTF-16)

	2229 } else if (quad0123 == 0x0000FEFF) {

	2230 destatep->bom_hint = UTF32BE;

	2231 Boost(destatep, F_UTF_32BE, kBoostInitial * 2);

	2232 best_enc = F_UTF_32BE;

	2233 } else if (quad0123 == 0xFFFE0000) {

	2234 destatep->bom_hint = UTF32LE;

	2235 Boost(destatep, F_UTF_32LE, kBoostInitial * 2);

	2236 best_enc = F_UTF_32LE;

	2237 // UTF-16

	2238 } else if (pair01 == 0xFEFF) {

	2239 destatep->bom_hint = UTF16BE;

	2240 Boost(destatep, F_UTF_16BE, kBoostInitial * 3);

	2241 best_enc = F_UTF_16BE;

	2242 } else if (pair01 == 0xFFFE) {

	2243 destatep->bom_hint = UTF16LE;

	2244 Boost(destatep, F_UTF_16LE, kBoostInitial * 3);

	2245 best_enc = F_UTF_16LE;

	2246

	2247 // Possible seven-bit ASCII encoded as UTF-16/32

	2248 // UTF-32 (test before UTF-16)

	2249 } else if (((quad0123 & 0xffffff00) == 0) &&

	2250 (kIsPrintableAscii[src[3]] != 0)) {

	2251 Boost(destatep, F_UTF_32BE, kBoostInitial);

	2252 Whack(destatep, F_UTF_32LE, kBadPairWhack); // Illegal char

	2253 best_enc = F_UTF_32BE;

	2254 } else if (((quad0123 & 0x00ffffff) == 0) &&

	2255 (kIsPrintableAscii[src[0]] != 0)) {

	2256 Boost(destatep, F_UTF_32LE, kBoostInitial);

	2257 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal char

	2258 best_enc = F_UTF_32LE;

	2259 } else if ((src[0] == 0x00) && (kIsPrintableAscii[src[1]] != 0)) {

	2260 Boost(destatep, F_UTF_16BE, kBoostInitial);

	2261 best_enc = F_UTF_16BE;

	2262 } else if ((src[1] == 0x00) && (kIsPrintableAscii[src[0]] != 0)) {

	2263 Boost(destatep, F_UTF_16LE, kBoostInitial);

	2264 best_enc = F_UTF_16LE;

	2265

	2266 // Whack if 0000 or FFFF

	2267 // UTF-32 (test before UTF-16)

	2268 } else if (quad0123 == 0x00000000) {

	2269 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal char

	2270 Whack(destatep, F_UTF_32LE, kBadPairWhack);

	2271 Whack(destatep, F_UTF_16BE, kBadPairWhack);

	2272 Whack(destatep, F_UTF_16LE, kBadPairWhack);

	2273 best_enc = -1;

	2274 } else if (quad0123 == 0xffffffff) {

	2275 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal char

	2276 Whack(destatep, F_UTF_32LE, kBadPairWhack);

	2277 Whack(destatep, F_UTF_16BE, kBadPairWhack);

	2278 Whack(destatep, F_UTF_16LE, kBadPairWhack);

	2279 best_enc = -1;

	2280 } else if (pair01 == 0x0000) {

	2281 Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal char

	2282 Whack(destatep, F_UTF_16LE, kBadPairWhack);

	2283 best_enc = -1;

	2284 } else if (pair01 == 0xffff) {

	2285 Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal char

	2286 Whack(destatep, F_UTF_16LE, kBadPairWhack);

	2287 best_enc = -1;

	2288

	2289

	2290 // These are the first four bytes of some known binary file formats

	2291

	2292 // Boost BINARY bigtime if JPEG FFD8FFxx

	2293 // Boost BINARY bigtime if png 89504E47 (.PNG)

	2294 // Boost BINARY bigtime if gif 47494638 (GIF8)

	2295 // Boost BINARY bigtime if zip 504B0304 (PK..)

	2296 // Boost BINARY bigtime if gzip 1F8B08xx

	2297 // Boost BINARY bigtime if gzip 78DAxxxx

	2298 // Boost BINARY if PDF 25504446 (%PDF)

	2299 // Boost BINARY if SWF (FWSx or CWSx where x <= 0x1f)

	2300 } else if ((quad0123 & 0xffffff00) == 0xFFD8FF00) { // JPEG FFD8FFxx

	2301 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	2302 } else if (quad0123 == 0x89504E47) { // Hex 89 P N G

	2303 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	2304 } else if (quad0123 == 0x47494638) { // Hex GIF8

	2305 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	2306 } else if (quad0123 == 0x504B0304) { // Hex P K 03 04

	2307 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	2308 } else if ((quad0123 & 0xffffff00) == 0x1F8B0800) { // gzip 1F8B08xx

	2309 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	2310 } else if (pair01 == 0x78DA) { // gzip 78DAxxxx

	2311 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	2312 } else if (quad0123 == 0x25504446) { // Hex %PDF

	2313 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	2314 } else if ((quad0123 & 0xffffff1f) == 0x66535700) { // Hex FWSx

	2315 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	2316 } else if ((quad0123 & 0xffffff1f) == 0x63535700) { // Hex CWSx

	2317 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	2318

	2319 // More binary detect prefixes

	2320 // 7F E L F Executable and linking format

	2321 // M M 00 * TIFF (little-endian)

	2322 // * 00 M M TIFF (big-endian)

	2323 // 01 f c p Final cut pro

	2324 } else if (quad0123 == 0x7F454C46) { // Hex 7F E L F

	2325 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	2326 } else if (quad0123 == 0x4D4D002A) { // Hex M M 00 *

	2327 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	2328 } else if (quad0123 == 0x2A004D4D) { // Hex * 00 M M

	2329 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	2330 } else if (quad0123 == 0x01666370) { // Hex 01 f c p

	2331 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	2332

	2333 // More binary detect prefixes; all-ASCII names; heavy weight to avoid ASCII

	2334 // prefix overcoming binary

	2335 // C C S D USGS ISIS 3-D cube files

	2336 // S I M P FITS image header "SIMPLE "

	2337 } else if (quad0123 == 0x43435344) { // Hex C C S D

	2338 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	2339 } else if (quad0123 == 0x53494D50) { // Hex S I M P

	2340 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	2341

	2342 // More binary detect prefixes; all-ASCII names; lighter weight

	2343 // H W P Hangul word processor

	2344 // 8 B P S Photoshop

	2345 // P D S _ xx "PDS_VERSION_ID "

	2346 } else if (quad0123 == 0x48575020) { // Hex H W P

	2347 if ((19 <= text_length) &&

	2348 (memcmp(src, "HWP.Document.File.V", 19) == 0)) {

	2349 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	2350 } else if ((19 <= text_length) &&

	2351 (memcmp(src, "HWP Document File V", 19) == 0)) {

	2352 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	2353 } else {

	2354 Boost(destatep, F_BINARY, kBoostInitial * kWeakerBinary);

	2355 }

	2356 } else if (quad0123 == 0x38425053) { // Hex 8 B P S

	2357 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	2358 } else if (quad0123 == 0x5044535F) { // Hex P D S _

	2359 if ((14 <= text_length) && (memcmp(src, "PDS_VERSION_ID", 14) == 0)) {

	2360 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

	2361 } else {

	2362 Boost(destatep, F_BINARY, kBoostInitial * kWeakerBinary);

	2363 }

	2364 }

	2365

	2366 // There are several main Windows EXE file formats.

	2367 // Not examined here (prefix too short; never see them in Google pipeline)

	2368 // M Z DOS .exe Mark Zbikowski

	2369 // N E DOS 4.0 16-bit

	2370 // L E OS/2 VxD drivers

	2371 // L X OS/2

	2372 // P E Windows NT

	2373

	2374

	2375 // More user-defined

	2376 // http://www.freenet.am/armscii/ Armenian

	2377

	2378 // If any hints or BOM, etc. keep UTF 16/32 around

	2379 if ((destatep->enc_prob[F_UTF_16BE] > 0) \|\|

	2380 (destatep->enc_prob[F_UTF_16LE] > 0)) {

	2381 utf_16_indication = true;

	2382 }

	2383 if ((destatep->enc_prob[F_UTF_32BE] > 0) \|\|

	2384 (destatep->enc_prob[F_UTF_32LE] > 0)) {

	2385 utf_32_indication = true;

	2386 }

	2387

	2388

	2389 // Kill UTF16/32 right now if no positive indication of them

	2390 // Otherwise, they tend to rise to the top in 7-bit files with an

	2391 // occasional 0x02 byte in some comment or javascript

	2392 if (!utf_16_indication) {

	2393 Whack(destatep, F_UTF_16BE, kBadPairWhack * 8);

	2394 Whack(destatep, F_UTF_16LE, kBadPairWhack * 8);

	2395 Whack(destatep, F_Unicode, kBadPairWhack * 8);

	2396 }

	2397 if (!utf_32_indication) {

	2398 Whack(destatep, F_UTF_32BE, kBadPairWhack * 8);

	2399 Whack(destatep, F_UTF_32LE, kBadPairWhack * 8);

	2400 }

	2401

	2402 // Usually kill mixed encodings

	2403 if (!FLAGS_ced_allow_utf8utf8) {

	2404 Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8);

	2405 }

	2406 // 2011.11.07 never use UTF8CP1252 -- answer will be UTF8 instead

	2407 Whack(destatep, F_UTF8CP1252, kBadPairWhack * 8);

	2408

	2409 if (destatep->debug_data != NULL) {

	2410 // Show first four bytes of the input

	2411 char buff[16];

	2412 snprintf(buff, sizeof(buff), "%04x%04x", pair01, pair23);

	2413 SetDetailsEncProb(destatep, 0, best_enc, buff);

	2414 }

	2415 }

	2416

	2417

	2418

	// qsort()-style comparator that orders ints in descending order.
	//
	// v1, v2: pointers to the two int elements being compared.
	// Returns 1 when *v1 < *v2, -1 when *v1 > *v2, and 0 when they are equal,
	// so that larger values sort first.
	int IntCompare(const void* v1, const void* v2) {
	  // Compare the pointed-to values, not the element addresses; comparing
	  // addresses would make the result depend only on where the elements sit.
	  const int lhs = *static_cast<const int*>(v1);
	  const int rhs = *static_cast<const int*>(v2);
	  if (lhs < rhs) {return 1;}
	  if (lhs > rhs) {return -1;}
	  return 0;
	}

	2427

	2428 bool Base64Char(uint8 c) {

	2429 if (('A' <= c) && (c <= 'Z')) {return true;}

	2430 if (('a' <= c) && (c <= 'z')) {return true;}

	2431 if (('0' <= c) && (c <= '9')) {return true;}

	2432 if ('+' == c) {return true;}

	2433 if ('/' == c) {return true;}

	2434 return false;

	2435 }

	2436

	2437 int Base64ScanLen(const uint8* start, const uint8* limit) {

	2438 // We have a plausible beginning; scan entire base64 string

	2439 const uint8* ib64str = start;

	2440 const uint8* b64str = ib64str;

	2441 const uint8* b64strlimit = limit;

	2442 // if starts with + +++, assume it is drawing, so bogus

	2443 if (((limit - start) > 3) && (start[0] == '+') &&

	2444 (start[1] == '+') && (start[2] == '+')) {

	2445 return 81;

	2446 }

	2447 // Scan over base64

	2448 while ((b64str < b64strlimit) && (kBase64Value[*b64str++] >= 0)) {

	2449 }

	2450 b64str--; // We overshot by 1

	2451 return b64str - ib64str;

	2452 }

	2453

	2454 // Input is at least 8-character legal base64 string after +.

	2455 // But might be say + "Presse+Termine"

	2456 bool GoodUnicodeFromBase64(const uint8* start, const uint8* limit) {

	2457 // Reject base64 string len N if density of '+' is > 1 + N/16 (expect 1/64)

	2458 // Reject base64 string len N if density of A-Z is < 1 + N/16 (expect 26/64)

	2459 // Reject base64 string len N if density of a-z is < 1 + N/16 (expect 26/64)

	2460 // Reject base64 string len N if density of 0-9 is < 1 + N/32 (expect 10/64)

	2461 // NOTE: this requires at least one lower AND one upper AND one digit to pass

	2462 //

	2463 int plus_count = 0;

	2464 int lower_count = 0;

	2465 int upper_count = 0;

	2466 int digit_count = 0;

	2467 int len = limit - start;

	2468 for (const uint8* src = start; src < limit; ++src) {

	2469 uint8 c = *src;

	2470 if (('a' <= c) && (c <= 'z')) {

	2471 ++lower_count;

	2472 } else if (('A' <= c) && (c <= 'Z')) {

	2473 ++upper_count;

	2474 } else if (('0' <= c) && (c <= '0')) {

	2475 ++digit_count;

	2476 } else if (*src == '+') {

	2477 ++plus_count;

	2478 }

	2479 }

	2480

	2481 if (plus_count > (1 + (len >> 4))) {return false;}

	2482 if (lower_count < (1 + (len >> 4))) {return false;}

	2483 if (upper_count < (1 + (len >> 4))) {return false;}

	2484 if (digit_count < (1 + (len >> 5))) {return false;}

	2485

	2486 // checking the last character to reduce false positive

	2487 // since the last character may be padded to 0 bits at the end.

	2488 // refer to http://en.wikipedia.org/wiki/UTF-7

	2489 int nmod8 = len & 7;

	2490 const uint8 last = *(start+len-1);

	2491 // When UTF-7 string length%8=3, the last two bits must be padded as 0

	2492 if ((nmod8 == 3) && (kBase64Value[last] & 3)) {return false;}

	2493 // When UTF-7 string length%8=6, the last four bits must be padded as 0

	2494 if ((nmod8 == 6) && (kBase64Value[last] & 15)) {return false;}

	2495 return true;

	2496 }

	2497

	2498 // Prune here after N bytes

	2499 // Boost here for seven-bit sequences (at every prune)

	2500 // if (sevenbitrankedencoding)

	2501 // + UTF7 scan and boost/demote len mod 8 = 0 3 6

	2502 // ~ Hz scan and boost/demote len mod 8 = 0 2 4 6

	2503 // 1B 2022 scan and boost/demote len mod 8 = 0 2 4 6

	2504 // 0E 2022 scan and boost/demote len mod 8 = 0 2 4 6

	2505 // [0F 2022 boost/demote]

	2506 // 00 UTF16/32 scan and boost/demote offset = even/odd

	2507 //

	2508 // If still some seven-bit possibilities > pure ASCII,

	2509 // scan each possibility for clearer prob, s.t. about

	2510 // two good sequences is a clear win

	2511 // A-Z 00-19 00xx-64xx (B = 04xx)

	2512 // a-z 1A-33 68xx-CCxx (f = 7Cxx)

	2513 // 0-9 34-3D D0xx-F4xx (1 = D4xx)

	2514 // + 3E F8xx

	2515 // / 3F FCxx

	2516 // do another chunk with slow scan

	2517

	2518

	2519 // Boost, whack, or leave alone UTF-7 probablilty

	2520 void UTF7BoostWhack(DetectEncodingState* destatep, int next_pair, uint8 byte2) {

	2521 int off = destatep->interesting_offsets[AsciiPair][next_pair];

	2522 if (off >= destatep->prior_utf7_offset) {

	2523 // Not part of a previous successful UTF-7 string

	2524 ++destatep->utf7_starts;

	2525

	2526 if (byte2 == '-') {

	2527 // +- encoding for '+' neutral

	2528 } else if (!Base64Char(byte2)) {

	2529 // Not base64 -- not UTF-7, whack

	2530 Whack(destatep, F_UTF7, kBadPairWhack); // Illegal pair

	2531 } else {

	2532 // Starts with base64 byte, might be a good UTF7 sequence

	2533 const uint8* start = destatep->initial_src + off + 1; // over the +

	2534 int n = Base64ScanLen(start, destatep->limit_src);

	2535 int nmod8 = n & 7;

	2536 if ((n == 3) \|\| (n == 6)) {

	2537 // short but legal -- treat as neutral

	2538 } else if ((nmod8 == 0) \| (nmod8 == 3) \| (nmod8 == 6)) {

	2539 // Good length. Check for good Unicode.

	2540 if (GoodUnicodeFromBase64(start, start + n)) {

	2541 // Good length and Unicode, boost

	2542 Boost(destatep, F_UTF7, kBoostOnePair); // Found good

	2543 destatep->prior_utf7_offset = off + n + 1;

	2544 } else {

	2545 // Bad Unicode. Whack

	2546 Whack(destatep, F_UTF7, kBadPairWhack); // Illegal length

	2547 }

	2548 } else {

	2549 // Bad length. Whack

	2550 Whack(destatep, F_UTF7, kBadPairWhack); // Illegal length

	2551 }

	2552 }

	2553 }

	2554 }

	2555

	2556 // Boost, whack, or leave alone HZ probablilty

	2557 void HzBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {

	2558 if ((byte2 == '{') \|\| (byte2 == '}')) {

	2559 Boost(destatep, F_HZ_GB_2312, kBoostOnePair); // Found ~{ or ~}

	2560 } else if ((byte2 == '~') \|\| (byte2 == '\n')) {

	2561 destatep->enc_prob[F_HZ_GB_2312] += 0; // neutral

	2562 } else {

	2563 Whack(destatep, F_HZ_GB_2312, kBadPairWhack); // Illegal pair

	2564 }

	2565 }

	2566

	2567 // Boost, whack, or leave alone BINARY probablilty

	2568 void BinaryBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {

	2569 int quadrant = ((byte1 & 0x80) >> 6) \| ((byte2 & 0x80) >> 7);

	2570 int bucket8x4 = ((byte1 & 0xe0) >> 3) \| ((byte2 & 0xc0) >> 6);

	2571 uint32 quad_mask = 1 << quadrant;

	2572 uint32 bucket8x4_mask = 1 << bucket8x4;

	2573 if ((destatep->binary_quadrants_seen & quad_mask) == 0) {

	2574 destatep->binary_quadrants_seen \|= quad_mask;

	2575 destatep->binary_quadrants_count += 1;

	2576 if (destatep->binary_quadrants_count == 4) {

	2577 Boost(destatep, F_BINARY, kBoostOnePair * 2); // Found all 4 quadrants,

	2578 // boost 2 pairs

	2579 }

	2580 }

	2581 if ((destatep->binary_8x4_seen & bucket8x4_mask) == 0) {

	2582 destatep->binary_8x4_seen \|= bucket8x4_mask;

	2583 destatep->binary_8x4_count += 1;

	2584 if (destatep->binary_8x4_count >= 11) {

	2585 Boost(destatep, F_BINARY, kBoostOnePair * 4); // Found 11+/20 buckets,

	2586 // boost 4 pairs each time

	2587 }

	2588 }

	2589 }

	2590

	2591

	2592 // Demote UTF-16/32 on 0000 or FFFF, favoring Binary

	2593 void UTF1632BoostWhack(DetectEncodingState* destatep, int offset, uint8 byte1) {

	2594 if (byte1 == 0) { // We have 0000

	2595 Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal pair

	2596 Whack(destatep, F_UTF_16LE, kBadPairWhack); // Illegal pair

	2597 switch (offset & 3) {

	2598 case 0: // We get called with 0 4 8, etc. for ASCII/BMP as UTF-32BE

	2599 Whack(destatep, F_UTF_32LE, kBadPairWhack); // Illegal pair

	2600 Boost(destatep, F_UTF_32BE, kSmallInitDiff); // Good pair

	2601 break;

	2602 case 1: // We get called with 1 5 9, etc. for ASCII as UTF-32LE

	2603 case 2: // We get called with 2 6 10, etc. for BMP as UTF-32LE

	2604 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal pair

	2605 Boost(destatep, F_UTF_32LE, kSmallInitDiff); // Good pair

	2606 break;

	2607 case 3: // ambiguous

	2608 break;

	2609 }

	2610 } else { // We have ffff

	2611 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal pair

	2612 Whack(destatep, F_UTF_32LE, kBadPairWhack); // Illegal pair

	2613 Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal pair

	2614 Whack(destatep, F_UTF_16LE, kBadPairWhack); // Illegal pair

	2615 }

	2616 }

	2617

	2618 // Make even offset

	2619 void UTF16MakeEven(DetectEncodingState* destatep, int next_pair) {

	2620 destatep->interesting_offsets[OtherPair][next_pair] &= ~1;

	2621 }

	2622

	2623 bool ConsecutivePair(DetectEncodingState* destatep, int i) {

	2624 if (i <= 0) {

	2625 return false;

	2626 }

	2627 return destatep->interesting_offsets[OtherPair][i] ==

	2628 (destatep->interesting_offsets[OtherPair][i - 1] + 2);

	2629 }

	2630

	2631 // boost, whack, or leave alone UTF-8 probablilty

	2632 // Any whacks are also applied to UTF8UTF8; CheckUTF8UTF8Seq assumes good UTF8

	2633 // Returns total boost

	2634 int CheckUTF8Seq(DetectEncodingState* destatep, int weightshift) {

	2635 int startcount = destatep->prior_interesting_pair[OtherPair];

	2636 int endcount = destatep->next_interesting_pair[OtherPair];

	2637

	2638 int demotion_count = 0;

	2639 for (int i = startcount; i < endcount; ++i) {

	2640 int sub;

	2641 char* s = &destatep->interesting_pairs[OtherPair][i * 2];

	2642 // Demote four byte patterns that are more likely Latin1 than UTF-8

	2643 // C9AE, DF92, DF93, DFAB. See note at top.

	2644 // Demotion also boosts Latin1 and CP1252

	2645 uint8 s0 = static_cast<uint8>(s[0]);

	2646 uint8 s1 = static_cast<uint8>(s[1]);

	2647 if ((s0 == 0xc9) && (s1 == 0xae)) {++demotion_count;}

	2648 if ((s0 == 0xdf) && (s1 == 0x92)) {++demotion_count;}

	2649 if ((s0 == 0xdf) && (s1 == 0x93)) {++demotion_count;}

	2650 if ((s0 == 0xdf) && (s1 == 0xab)) {++demotion_count;}

	2651

	2652 if (!ConsecutivePair(destatep, i)) {

	2653 // Insert a blank into the sequence; avoid wrong splices

	2654 sub = (' ' >> 4) & 0x0f;

	2655 ++destatep->utf8_minicount[

	2656 static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_m inistate)][sub])];

	2657 destatep->next_utf8_ministate =

	2658 kMiniUTF8State[destatep->next_utf8_ministate][sub];

	2659 }

	2660 // Byte 0

	2661 sub = (s0 >> 4) & 0x0f;

	2662 ++destatep->utf8_minicount[

	2663 static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_min istate)][sub])];

	2664 destatep->next_utf8_ministate =

	2665 kMiniUTF8State[destatep->next_utf8_ministate][sub];

	2666 // Byte 1

	2667 sub = (s1 >> 4) & 0x0f;

	2668 ++destatep->utf8_minicount[

	2669 static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_min istate)][sub])];

	2670 destatep->next_utf8_ministate =

	2671 kMiniUTF8State[destatep->next_utf8_ministate][sub];

	2672 DCHECK((0 <= destatep->next_utf8_ministate) &&

	2673 (destatep->next_utf8_ministate < 8));

	2674 }

	2675

	2676

	2677 // For the four specific byte combinations above, Latin1/CP1252 is more likely

	2678 if (demotion_count > 0) {

	2679 Boost(destatep, F_Latin1, kGentleOnePair * demotion_count);

	2680 Boost(destatep, F_CP1252, kGentleOnePair * demotion_count);

	2681 }

	2682

	2683 // Boost UTF8 for completed good sequences

	2684 int total_boost = 2 * destatep->utf8_minicount[2] +

	2685 3 * destatep->utf8_minicount[3] +

	2686 4 * destatep->utf8_minicount[4];

	2687 // But not so much for demoted bytes

	2688 total_boost -= (3 * demotion_count);

	2689

	2690 total_boost *= kGentleOnePair;

	2691 total_boost >>= weightshift;

	2692 // Design: boost both UTF8 and UTF8UTF8 for each good sequence

	2693 Boost(destatep, F_UTF8, total_boost);

	2694 Boost(destatep, F_UTF8UTF8, total_boost);

	2695

	2696 destatep->utf8_minicount[5] += destatep->utf8_minicount[2]; // total chars

	2697 destatep->utf8_minicount[5] += destatep->utf8_minicount[3]; // total chars

	2698 destatep->utf8_minicount[5] += destatep->utf8_minicount[4]; // total chars

	2699 destatep->utf8_minicount[2] = 0;

	2700 destatep->utf8_minicount[3] = 0;

	2701 destatep->utf8_minicount[4] = 0;

	2702

	2703 // Whack (2 bytes) for errors

	2704 int error_whack = 2 * destatep->utf8_minicount[1];

	2705 error_whack *= kGentlePairWhack;

	2706 error_whack >>= weightshift;

	2707 Whack(destatep, F_UTF8, error_whack);

	2708 Whack(destatep, F_UTF8UTF8, error_whack);

	2709 destatep->utf8_minicount[1] = 0;

	2710

	2711 return total_boost - error_whack;

	2712 }

	2713

	2714

	2715 // Boost, whack, or leave alone UTF8UTF8 probability

	2716 //

	2717 // We are looking for

	2718 // (1) chars ONLY in set UTF8(0080)..UTF8(00FF), including for 80..9F the

	2719 // MS CP1252 mappings, and

	2720 // (2) sequences of 2 or more such characters

	2721 //

	2722 // If so, we could be looking at some non-7-bit encoding extra-converted

	2723 // to UTF-8. The most common observed is CP1252->UTF8 twice,

	2724 // 1252=>UTF8 : 1252=>UTF8

	2725 // where the colon means "take those bytes and pretend that they are 1252".

	2726 // We have a couple of examples of BIG5 bytes converted as though

	2727 // they were 1252,

	2728 // BIG5 : 1252=>UTF8

	2729 //

	2730 // Of course, we don't want correctly converted 1252 to be flagged here

	2731 // 1252=>UTF8

	2732 // So we want the input high bytes to be in pairs or longer, hence the

	2733 // output UTF8 in groups of four bytes or more

	2734 //

	2735 // Good chars: C2xx, C3xx,

	2736 // Good chars: C592, C593, C5A0, C5A1, C5B8, C5BD, C5BE, C692, CB86, CB9C

	2737 // Good chars: E280xx E282AC E284A2

	2738 // C2xx 1100001x 10xxxxxx (128/128)

	2739 // C5xx 11000101 10xx00xx (16/4)

	2740 // C5xx 11000101 10111xxx (8/3)

	2741 // C692 11000110 10010010 (1/1)

	2742 // CBxx 11001011 100xx1x0 (8/2)

	2743 // E28x 11100010 10000xx0 (4/3)

	2744 //

	2745 // Returns total boost

	2746 int CheckUTF8UTF8Seq(DetectEncodingState* destatep, int weightshift) {

	2747 int this_pair = destatep->prior_interesting_pair[OtherPair];

	2748 int startbyteoffset = this_pair * 2;

	2749 int endbyteoffset = destatep->next_interesting_pair[OtherPair] * 2;

	2750 char* startbyte = &destatep->interesting_pairs[OtherPair][startbyteoffset];

	2751 char* endbyte = &destatep->interesting_pairs[OtherPair][endbyteoffset];

	2752

	2753 int pair_number = this_pair;

	2754 for (char* s = startbyte; s < endbyte; s += 2) {

	2755 int next = destatep->next_utf8utf8_ministate;

	2756 if (!ConsecutivePair(destatep, pair_number)) {

	2757 // Insert two blanks into the sequence to avoid wrong splices

	2758 // go back to no odd-byte offset

	2759 destatep->utf8utf8_odd_byte = 0;

	2760 int sub = UTF88Sub(' ', ' ');

	2761 ++destatep->utf8utf8_minicount[static_cast<int>(kMiniUTF8UTF8Count[next][s ub])];

	2762 next = kMiniUTF8UTF8State[next][sub];

	2763 }

	2764

	2765 int odd = destatep->utf8utf8_odd_byte;

	2766 if (s + 1 + odd >= endbyte) continue;

	2767 int sub = UTF88Sub(s[0 + odd], s[1 + odd]);

	2768 destatep->utf8utf8_odd_byte ^= kMiniUTF8UTF8Odd[next][sub];

	2769 ++destatep->utf8utf8_minicount[

	2770 static_cast<int>(kMiniUTF8UTF8Count[next][sub])];

	2771 destatep->next_utf8utf8_ministate = kMiniUTF8UTF8State[next][sub];

	2772 ++pair_number;

	2773 }

	2774

	2775 // Boost for completed good sequences; each count covers two chars.

	2776 // Design: boost UTF8UTF8 above UTF8 for each good sequence

	2777 int total_boost = (2) * destatep->utf8utf8_minicount[2] +

	2778 (2) * destatep->utf8utf8_minicount[3] +

	2779 (2) * destatep->utf8utf8_minicount[4];

	2780 total_boost *= kGentleOnePair;

	2781 total_boost >>= weightshift;

	2782 Boost(destatep, F_UTF8UTF8, total_boost);

	2783

	2784 // Track total characters

	2785 destatep->utf8utf8_minicount[5] += destatep->utf8utf8_minicount[2];

	2786 destatep->utf8utf8_minicount[5] += destatep->utf8utf8_minicount[3];

	2787 destatep->utf8utf8_minicount[5] += destatep->utf8utf8_minicount[4];

	2788 destatep->utf8utf8_minicount[2] = 0;

	2789 destatep->utf8utf8_minicount[3] = 0;

	2790 destatep->utf8utf8_minicount[4] = 0;

	2791

	2792 // Design: Do not whack UTF8UTF8 below UTF8 for each bad sequence

	2793

	2794 destatep->utf8utf8_minicount[1] = 0;

	2795 return total_boost;

	2796 }

	2797

	2798

	2799 // boost, whack, or leave alone UTF-32 probablilty

	2800 // Expecting 0000PPxx 0000QQxx where PP mostly = QQ (UTF-32BE)

	2801 // Expecting xxPP0000 xxQQ0000 where PP mostly = QQ (UTF-32LE)

	2802 void CheckUTF32ActiveSeq(DetectEncodingState* destatep) {

	2803 // Not needed

	2804 return;

	2805 }

	2806

	2807 // We give a gentle boost for each paired SO ... SI, whack others

	2808 void CheckIso2022ActiveSeq(DetectEncodingState* destatep) {

	2809 int this_pair = destatep->prior_interesting_pair[OtherPair];

	2810 int startbyteoffset = this_pair * 2;

	2811 int endbyteoffset = destatep->next_interesting_pair[OtherPair] * 2;

	2812 char* startbyte = &destatep->interesting_pairs[OtherPair][startbyteoffset];

	2813 char* endbyte = &destatep->interesting_pairs[OtherPair][endbyteoffset];

	2814

	2815 // Initial <esc> char must precede SO/SI

	2816 // HZ_GB_2312 has no alternation constraint on 1- and 2-byte segments

	2817 // ISO-2022-JP (JIS) has no alternation constraint on 1- and 2-byte segments

	2818 // ISO-2022-CN has no alternation constraint on 1- and 2-byte segments

	2819 // ISO-2022-KR requires alternation between 1- and 2-byte segments

	2820 // JIS:

	2821 // <esc> ( B ISO-2022-JP [1b 28 42] SI to ASCII

	2822 // <esc> ( J ISO-2022-JP [1b 28 4a] SI to X0201

	2823 // <esc> $ @ ISO-2022-JP [1b 24 40] SO to X0208-78 twobyte

	2824 // <esc> $ B ISO-2022-JP [1b 24 42] SO to X0208-83 twobyte

	2825 for (char* s = startbyte; s < endbyte; s += 2) {

	2826 if (s[0] == 0x1b) {

	2827 if (s[1] == 0x24) {

	2828 // <esc> $ is SO

	2829 destatep->next_2022_state = SOSI_TWOBYTE; // SO to two-byte

	2830 } else if (s[1] == 0x28) {

	2831 if (destatep->next_2022_state == SOSI_TWOBYTE) {

	2832 Boost(destatep, F_JIS, kGentlePairBoost);

	2833 } else if (destatep->next_2022_state == SOSI_ONEBYTE) {

	2834 Whack(destatep, F_JIS, kGentlePairWhack);

	2835 }

	2836 destatep->next_2022_state = SOSI_ONEBYTE; // JIS SI to one-byte

	2837 } else {

	2838 Whack(destatep, F_JIS, kBadPairWhack);

	2839 Whack(destatep, F_ISO_2022_CN, kBadPairWhack);

	2840 Whack(destatep, F_ISO_2022_KR, kBadPairWhack);

	2841 destatep->next_2022_state = SOSI_ERROR; // not 2022

	2842 }

	2843 } else if (s[0] == 0x0e) {

	2844 // <so>

	2845 Whack(destatep, F_JIS, kBadPairWhack);

	2846 if (destatep->next_2022_state != SOSI_NONE) {

	2847 destatep->next_2022_state = SOSI_TWOBYTE; // SO to two-byte

	2848 } else {

	2849 // ESC required before SO/SI

	2850 Whack(destatep, F_ISO_2022_CN, kBadPairWhack * 4);

	2851 Whack(destatep, F_ISO_2022_KR, kBadPairWhack * 4);

	2852 destatep->next_2022_state = SOSI_ERROR; // SO not after SI

	2853 }

	2854 } else if (s[0] == 0x0f) {

	2855 // <si>

	2856 Whack(destatep, F_JIS, kBadPairWhack);

	2857 if (destatep->next_2022_state != SOSI_NONE) {

	2858 if (destatep->next_2022_state == SOSI_TWOBYTE) {

	2859 Boost(destatep, F_ISO_2022_CN, kGentlePairBoost);

	2860 Boost(destatep, F_ISO_2022_KR, kGentlePairBoost);

	2861 } else if (destatep->next_2022_state == SOSI_ONEBYTE) {

	2862 Whack(destatep, F_ISO_2022_CN, kGentlePairWhack);

	2863 Whack(destatep, F_ISO_2022_KR, kGentlePairWhack);

	2864 }

	2865 destatep->next_2022_state = SOSI_ONEBYTE; // SI to one-byte

	2866 } else {

	2867 // ESC required before SO/SI

	2868 Whack(destatep, F_ISO_2022_CN, kBadPairWhack * 4);

	2869 Whack(destatep, F_ISO_2022_KR, kBadPairWhack * 4);

	2870 destatep->next_2022_state = SOSI_ERROR; // SI not after SO

	2871 }

	2872 } else if (s[0] <= 0x1f) {

	2873 // Some other control code. Allow ht lf [ff] cr

	2874 if ((s[0] != 0x09) && (s[0] != 0x0a) &&

	2875 (s[0] != 0x0c) && (s[0] != 0x0d)) {

	2876 // Otherwise these can float to the top on bad bytes

	2877 Whack(destatep, F_JIS, kBadPairWhack);

	2878 Whack(destatep, F_ISO_2022_CN, kBadPairWhack);

	2879 Whack(destatep, F_ISO_2022_KR, kBadPairWhack);

	2880 }

	2881 }

	2882 }

	2883

	2884 // If no start, keep the probability pinned at zero (or below)

	2885 if (destatep->next_2022_state == SOSI_NONE) {

	2886 destatep->enc_prob[F_ISO_2022_CN] =

	2887 minint(0, destatep->enc_prob[F_ISO_2022_CN]);

	2888 destatep->enc_prob[F_ISO_2022_KR] =

	2889 minint(0, destatep->enc_prob[F_ISO_2022_KR]);

	2890 destatep->enc_prob[F_JIS] =

	2891 minint(0, destatep->enc_prob[F_JIS]);

	2892 }

	2893 }

	2894

	2895 // We give a gentle boost for each paired ~{ ... ~}, whack others

	2896 void CheckHzActiveSeq(DetectEncodingState* destatep) {

	2897 int this_pair = destatep->prior_interesting_pair[AsciiPair];

	2898 int startbyteoffset = this_pair * 2;

	2899 int endbyteoffset = destatep->next_interesting_pair[AsciiPair] * 2;

	2900 char* startbyte = &destatep->interesting_pairs[AsciiPair][startbyteoffset];

	2901 char* endbyte = &destatep->interesting_pairs[AsciiPair][endbyteoffset];

	2902

	2903 for (char* s = startbyte; s < endbyte; s += 2) {

	2904 // Look for initial ~{ pair

	2905 if ((s[0] == '~') && (s[1] == '{')) {

	2906 destatep->next_hz_state = SOSI_TWOBYTE; // SO to two-byte

	2907 }

	2908 // Also look for closing ~} pair

	2909 if ((s[0] == '~') && (s[1] == '}')) {

	2910 if (destatep->next_hz_state == SOSI_TWOBYTE) {

	2911 Boost(destatep, F_HZ_GB_2312, kGentlePairBoost);

	2912 } else if (destatep->next_hz_state == SOSI_ONEBYTE) {

	2913 Whack(destatep, F_HZ_GB_2312, kGentlePairWhack);

	2914 }

	2915 destatep->next_hz_state = SOSI_ONEBYTE; // SI to one-byte

	2916 }

	2917 }

	2918

	2919 // If no start, keep the probability pinned at zero (or below)

	2920 if (destatep->next_hz_state == SOSI_NONE) {

	2921 destatep->enc_prob[F_HZ_GB_2312] =

	2922 minint(0, destatep->enc_prob[F_HZ_GB_2312]);

	2923 }

	2924 }

	2925

	2926 // We give a gentle boost after an odd number of 8Fxxxx triples, which

	2927 // put subsequent bigrams out of phase until a low byte or another 8Fxxxx

	2928 void CheckEucJpSeq(DetectEncodingState* destatep) {

	2929 int this_pair = destatep->prior_interesting_pair[OtherPair];

	2930 int startbyteoffset = this_pair * 2;

	2931 int endbyteoffset = destatep->next_interesting_pair[OtherPair] * 2;

	2932 char* startbyte = &destatep->interesting_pairs[OtherPair][startbyteoffset];

	2933 char* endbyte = &destatep->interesting_pairs[OtherPair][endbyteoffset];

	2934

	2935 for (char* s = startbyte; s < endbyte; s += 2) {

	2936 // Boost if out of phase (otherwise, EUC-JP will score badly after 8Fxxxx)

	2937 if (destatep->next_eucjp_oddphase) {

	2938 //printf(" EucJp boost[%02x%02x]\n", s[0], s[1]); // TEMP

	2939 Boost(destatep, F_EUC_JP, kGentlePairBoost * 2);

	2940 }

	2941

	2942 uint8 s0 = static_cast<uint8>(s[0]);

	2943 uint8 s1 = static_cast<uint8>(s[1]);

	2944 // Look for phase flip at 8F

	2945 if ((s0 & 0x80) == 0x00) {

	2946 destatep->next_eucjp_oddphase = false;

	2947 } else if (s0 == 0x8f) {

	2948 destatep->next_eucjp_oddphase = !destatep->next_eucjp_oddphase;

	2949 }

	2950 if ((s1 & 0x80) == 0x00) {

	2951 destatep->next_eucjp_oddphase = false;

	2952 } else if (s1 == 0x8f) {

	2953 destatep->next_eucjp_oddphase = !destatep->next_eucjp_oddphase;

	2954 }

	2955 }

	2956 }

	2957

	2958 // Boost, whack, or leave alone BINARY probablilty

	2959 // Also called if UTF 16/32 active

	2960 void CheckBinaryDensity(const uint8* src, DetectEncodingState* destatep,

	2961 int delta_otherpairs) {

	2962 // No change if not much gathered information

	2963 if (delta_otherpairs == 0) {

	2964 // Only ASCII pairs this call

	2965 return;

	2966 }

	2967 int next_pair = destatep->next_interesting_pair[OtherPair];

	2968

	2969 // Look at density of interesting pairs [0..src)

	2970 int delta_offset = static_cast<int>(src - destatep->initial_src); // actual

	2971

	2972 // Look at density of interesting pairs [0..next_interesting)

	2973 int low_byte = destatep->interesting_offsets[OtherPair][0];

	2974 //int high_byte = destatep->interesting_offsets[OtherPair][next_pair - 1] + 2;

	2975 //int byte_span = high_byte - low_byte;

	2976 int byte_span = delta_offset - low_byte;

	2977

	2978 // If all ASCII for the first 4KB, reject

	2979 // If mostly ASCII in the first 5KB, reject

	2980 if ((low_byte >= kBinaryHardAsciiLimit) \|\| (delta_offset >= kBinarySoftAsciiLi mit)) {

	2981 // Not binary early enough in text

	2982 Whack(destatep, F_BINARY, kBadPairWhack * 4);

	2983 Whack(destatep, F_UTF_32BE, kBadPairWhack * 4);

	2984 Whack(destatep, F_UTF_32LE, kBadPairWhack * 4);

	2985 Whack(destatep, F_UTF_16BE, kBadPairWhack * 4);

	2986 Whack(destatep, F_UTF_16LE, kBadPairWhack * 4);

	2987 return;

	2988 }

	2989

	2990 // Density 1.0 for N pairs takes 2*N bytes

	2991 // Whack if < 1/16 after first non_ASCII pair

	2992 if ((next_pair * 2 * 16) < byte_span) {

	2993 // Not dense enough

	2994 Whack(destatep, F_BINARY, kBadPairWhack * 4);

	2995 Whack(destatep, F_UTF_32BE, kBadPairWhack * 4);

	2996 Whack(destatep, F_UTF_32LE, kBadPairWhack * 4);

	2997 Whack(destatep, F_UTF_16BE, kBadPairWhack * 4);

	2998 Whack(destatep, F_UTF_16LE, kBadPairWhack * 4);

	2999 }

	3000

	3001 if (next_pair < 8) {

	3002 // Fewer than 8 non-ASCII total; too soon to boost

	3003 return;

	3004 }

	3005

	3006 // Density 1.0 for N pairs takes 2*N bytes

	3007 // Boost if density >= 1/4, whack if < 1/16

	3008 if ((next_pair * 2 * 4) >= byte_span) {

	3009 // Very dense

	3010 // Only boost if at least 2 quadrants seen

	3011 if (destatep->binary_quadrants_count >= 2) {

	3012 Boost(destatep, F_BINARY, kSmallInitDiff);

	3013 Boost(destatep, F_UTF_32BE, kSmallInitDiff);

	3014 Boost(destatep, F_UTF_32LE, kSmallInitDiff);

	3015 Boost(destatep, F_UTF_16BE, kSmallInitDiff);

	3016 Boost(destatep, F_UTF_16LE, kSmallInitDiff);

	3017 }

	3018 }

	3019 }

	3020

	3021

	3022 // Look at a number of special-case encodings whose reliable detection depends

	3023 // on sequencing or other properties

	3024 // AsciiPair probibilities (UTF7 and HZ) are all done here

	3025 void ActiveSpecialBoostWhack(const uint8* src, DetectEncodingState* destatep) {

	3026 int delta_asciipairs = destatep->next_interesting_pair[AsciiPair] -

	3027 destatep->prior_interesting_pair[AsciiPair];

	3028 int delta_otherpairs = destatep->next_interesting_pair[OtherPair] -

	3029 destatep->prior_interesting_pair[OtherPair];

	3030

	3031 // The two pure ASCII encodings

	3032 if (UTF7OrHzActive(destatep) && (delta_asciipairs > 0)) {

	3033 // Adjust per pair

	3034 for (int i = 0; i < delta_asciipairs; ++i) {

	3035 int next_pair = destatep->prior_interesting_pair[AsciiPair] + i;

	3036 uint8 byte1 = destatep->interesting_pairs[AsciiPair][next_pair * 2 + 0];

	3037 uint8 byte2 = destatep->interesting_pairs[AsciiPair][next_pair * 2 + 1];

	3038 if (byte1 == '+') {

	3039 // Boost, whack, or leave alone UTF-7 probablilty

	3040 UTF7BoostWhack(destatep, next_pair, byte2);

	3041 if (destatep->debug_data != NULL) {

	3042 // Show UTF7 entry

	3043 char buff[16];

	3044 snprintf(buff, sizeof(buff), "%02x%02x+", byte1, byte2);

	3045 SetDetailsEncProb(destatep,

	3046 destatep->interesting_offsets[AsciiPair][next_pair],

	3047 kMostLikelyEncoding[(byte1 << 8) + byte2],

	3048 buff);

	3049 }

	3050 } else if (byte1 == '~') {

	3051 // Boost, whack, or leave alone HZ probablilty

	3052 HzBoostWhack(destatep, byte1, byte2);

	3053 if (destatep->debug_data != NULL) {

	3054 // Show Hz entry

	3055 char buff[16];

	3056 snprintf(buff, sizeof(buff), "%02x%02x~", byte1, byte2);

	3057 SetDetailsEncProb(destatep,

	3058 destatep->interesting_offsets[AsciiPair][next_pair],

	3059 kMostLikelyEncoding[(byte1 << 8) + byte2],

	3060 buff);

	3061 }

	3062 }

	3063 }

	3064

	3065 // Kill UTF-7 now if at least 8 + pairs and not confirmed valid UTF-7

	3066 if ((destatep->utf7_starts >= 8) && (destatep->prior_utf7_offset == 0)) {

	3067 Whack(destatep, F_UTF7, kBadPairWhack * 8); // flush

	3068 }

	3069 }

	3070

	3071

	3072

	3073 // All the other encodings

	3074 if (OtherActive(destatep) && (delta_otherpairs > 0)) {

	3075 // Adjust per pair

	3076 int biggest_weightshift = 0;

	3077 for (int i = 0; i < delta_otherpairs; ++i) {

	3078 int next_pair = destatep->prior_interesting_pair[OtherPair] + i;

	3079 uint8 byte1 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 0];

	3080 uint8 byte2 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 1];

	3081 int off = destatep->interesting_offsets[OtherPair][next_pair];

	3082 int weightshift = destatep->interesting_weightshift[OtherPair][next_pair];

	3083 biggest_weightshift = maxint(biggest_weightshift, weightshift);

	3084

	3085 if (byte1 == 0x00) {

	3086 if (byte2 == 0x00) {

	3087 UTF1632BoostWhack(destatep, off, byte1);

	3088 } else if ((kIsPrintableAscii[byte2] != 0) && ((off & 1) != 0)) {

	3089 // We have 00xx at an odd offset. Turn into preceding even offset

	3090 // for possible Ascii text in UTF-16LE or UTF-32LE (vs BE)

	3091 // This will cascade into caller's probability update

	3092 // 00 is illegal for all other encodings, so it doesn't matter to them

	3093 UTF16MakeEven(destatep, next_pair);

	3094 }

	3095 if (destatep->debug_data != NULL) {

	3096 // Show 0000 detail entry for this bigram

	3097 char buff[16];

	3098 snprintf(buff, sizeof(buff), "%02x%02xZ", byte1, byte2);

	3099 SetDetailsEncProb(destatep,

	3100 destatep->interesting_offsets[OtherPair][next_pair],

	3101 kMostLikelyEncoding[(byte1 << 8) + byte2],

	3102 buff);

	3103 }

	3104 }

	3105 if (byte1 == 0xff) {

	3106 if (byte2 == 0xff) {

	3107 UTF1632BoostWhack(destatep, off, byte1);

	3108 }

	3109 if (destatep->debug_data != NULL) {

	3110 // Show FFFF detail entry for this bigram

	3111 char buff[16];

	3112 snprintf(buff, sizeof(buff), "%02x%02xF", byte1, byte2);

	3113 SetDetailsEncProb(destatep,

	3114 destatep->interesting_offsets[OtherPair][next_pair],

	3115 kMostLikelyEncoding[(byte1 << 8) + byte2],

	3116 buff);

	3117 }

	3118 }

	3119 if (BinaryActive(destatep)) {

	3120 BinaryBoostWhack(destatep, byte1, byte2);

	3121 }

	3122 } // End for i

	3123

	3124 // Adjust per entire-pair-span

	3125 int utf8_boost = 0;

	3126 int utf8utf8_boost = 0;

	3127 if (UTF8Active(destatep)) {

	3128 utf8_boost = CheckUTF8Seq(destatep, biggest_weightshift);

	3129 }

	3130

	3131 if (UTF8UTF8Active(destatep)) {

	3132 utf8utf8_boost = CheckUTF8UTF8Seq(destatep, biggest_weightshift);

	3133 }

	3134

	3135 if (UTF1632Active(destatep)) {

	3136 CheckUTF32ActiveSeq(destatep);

	3137 }

	3138

	3139 if (Iso2022Active(destatep)) {

	3140 CheckIso2022ActiveSeq(destatep);

	3141 }

	3142

	3143 if (HzActive(destatep)) {

	3144 CheckHzActiveSeq(destatep);

	3145 }

	3146

	3147 if (EUCJPActive(destatep)) {

	3148 CheckEucJpSeq(destatep);

	3149 }

	3150

	3151 if (BinaryActive(destatep) \|\| UTF1632Active(destatep)) {

	3152 CheckBinaryDensity(src, destatep, delta_otherpairs);

	3153 }

	3154 }

	3155 // ISO-2022 encodings do OK on their own, using straight probabilities? Not on bad bytes

	3156

	3157 if (destatep->debug_data != NULL) {

	3158 // Show sequencing result

	3159 SetDetailsEncLabel(destatep, "seq");

	3160 }

	3161 }

	3162

	3163

	3164 void PrintTopEnc(DetectEncodingState* destatep, int n) {

	3165 // Print top n or fewer

	3166 int temp_sort[NUM_RANKEDENCODING];

	3167 for (int j = 0; j < destatep->rankedencoding_list_len; ++j) {

	3168 int rankedencoding = destatep->rankedencoding_list[j];

	3169 temp_sort[j] = destatep->enc_prob[rankedencoding];

	3170 }

	3171

	3172 qsort(temp_sort, destatep->rankedencoding_list_len,

	3173 sizeof(temp_sort[0]), IntCompare);

	3174

	3175 int top_n = minint(n, destatep->rankedencoding_list_len);

	3176 int showme = temp_sort[top_n - 1]; // Print this value and above

	3177

	3178 printf("rankedencodingList top %d: ", top_n);

	3179 for (int j = 0; j < destatep->rankedencoding_list_len; ++j) {

	3180 int rankedencoding = destatep->rankedencoding_list[j];

	3181 if (showme <= destatep->enc_prob[rankedencoding]) {

	3182 printf("%s=%d ",

	3183 MyEncodingName(kMapToEncoding[rankedencoding]),

	3184 destatep->enc_prob[rankedencoding]);

	3185 }

	3186 }

	3187 printf("\n\n");

	3188 }

	3189

	3190 // If the same bigram repeats, don't boost its best encoding too much

	3191 bool RepeatedBigram(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {

	3192 int this_bigram = (byte1 << 8) \| byte2;

	3193 // If 00xx 01xx 02xx ... 1fxx, take out bottom 4 bits of xx.

	3194 // This ignores parts of Yahoo 0255 0254 0243 0247 0245 0243 0250 0255 ...

	3195 // It may screw up UTF-16BE

	3196 // It may screw up ISO-2022 (1b24 suppresses 1b28)

	3197 if (byte1 < 0x20) {

	3198 this_bigram &= 0xfff0;

	3199 }

	3200 if (this_bigram == destatep->prior_bigram[0]) {return true;}

	3201 if (this_bigram == destatep->prior_bigram[1]) {return true;}

	3202 if (this_bigram == destatep->prior_bigram[2]) {return true;}

	3203 if (this_bigram == destatep->prior_bigram[3]) {return true;}

	3204 // Round-robin replacement

	3205 destatep->prior_bigram[destatep->next_prior_bigram] = this_bigram;

	3206 destatep->next_prior_bigram = (destatep->next_prior_bigram + 1) & 3;

	3207 return false;

	3208 }

	3209

	3210 // Sometimes illegal bytes are used as markers between text that Javascript

	3211 // is going to decode. Don't overboost the Binary encoding for markers 01-FF.

	3212 // Just count first pair per 8x4 bucket

	3213 bool RepeatedBinary(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {

	3214 int bucket8x4 = ((byte1 & 0xe0) >> 3) \| ((byte2 & 0xc0) >> 6);

	3215 uint32 bucket8x4_mask = 1 << bucket8x4;

	3216 if ((destatep->binary_8x4_seen & bucket8x4_mask) == 0) {

	3217 destatep->binary_8x4_seen \|= bucket8x4_mask;

	3218 destatep->binary_8x4_count += 1;

	3219 return false;

	3220 }

	3221 return true;

	3222 }

	3223

	3224

	3225

	3226

	3227 // Find current top two rankedencoding probabilities

	3228 void ReRank(DetectEncodingState* destatep) {

	3229 destatep->top_prob = -1;

	3230 destatep->second_top_prob = -1;

	3231 // Leave unchanged

	3232 //destatep->top_rankedencoding =

	3233 // destatep->rankedencoding_list[0]; // Just to make well-defined

	3234 //destatep->second_top_rankedencoding =

	3235 // destatep->rankedencoding_list[1]; // Just to make well-defined

	3236 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {

	3237 int rankedencoding = destatep->rankedencoding_list[j];

	3238 if (destatep->top_prob < destatep->enc_prob[rankedencoding]) {

	3239 // Make sure top 2 are in different superset groups

	3240 if (kMapEncToBaseEncoding[kMapToEncoding[destatep->top_rankedencoding]] !=

	3241 kMapEncToBaseEncoding[kMapToEncoding[rankedencoding]]) {

	3242 destatep->second_top_prob =

	3243 destatep->top_prob; // old top to second

	3244 destatep->second_top_rankedencoding =

	3245 destatep->top_rankedencoding; // old top to second

	3246 }

	3247 destatep->top_prob = destatep->enc_prob[rankedencoding];

	3248 destatep->top_rankedencoding = rankedencoding;

	3249 } else if (destatep->second_top_prob < destatep->enc_prob[rankedencoding]) {

	3250 if (kMapEncToBaseEncoding[kMapToEncoding[destatep->top_rankedencoding]] !=

	3251 kMapEncToBaseEncoding[kMapToEncoding[rankedencoding]]) {

	3252 destatep->second_top_prob = destatep->enc_prob[rankedencoding];

	3253 destatep->second_top_rankedencoding = rankedencoding;

	3254 }

	3255 }

	3256 }

	3257 }

	3258

	3259 void SimplePrune(DetectEncodingState* destatep, int prune_diff) {

	3260 // Prune the list of active encoding families

	3261 int keep_prob = destatep->top_prob - prune_diff;

	3262

	3263 destatep->active_special = 0;

	3264 int k = 0;

	3265 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {

	3266 bool keep = true;

	3267 int rankedencoding = destatep->rankedencoding_list[j];

	3268

	3269 // If count is too low, ditch it

	3270 if (destatep->enc_prob[rankedencoding] < keep_prob) {keep = false;}

	3271

	3272 // Keep it. This will always keep at least top_prob rankedencoding

	3273 if (keep) {

	3274 destatep->active_special \|= kSpecialMask[kMapToEncoding[rankedencoding]];

	3275 destatep->rankedencoding_list[k++] = rankedencoding;

	3276 }

	3277 }

	3278

	3279 destatep->rankedencoding_list_len = k;

	3280 }

	3281

	3282 // Recalculate reliable

	3283 void CalcReliable(DetectEncodingState* destatep) {

	3284 // Encoding result is reliable if big difference in top two, or if

	3285 // only Ascii7 ever encountered

	3286 // Also reliable if exactly one OtherPair and it's best encoding matches top

	3287 destatep->reliable = false;

	3288 if (destatep->next_interesting_pair[OtherPair] == 0) {

	3289 // Only 7-bit ASCII

	3290 destatep->reliable = true;

	3291 return;

	3292 }

	3293 if ((destatep->top_prob - destatep->second_top_prob) >=

	3294 FLAGS_ced_reliable_difference) {

	3295 destatep->reliable = true;

	3296 return;

	3297 }

	3298 if (destatep->next_interesting_pair[OtherPair] == 1) {

	3299 uint8 byte1 = destatep->interesting_pairs[OtherPair][0];

	3300 uint8 byte2 = destatep->interesting_pairs[OtherPair][1];

	3301 int best_enc = kMostLikelyEncoding[(byte1 << 8) + byte2];

	3302 if (best_enc == destatep->top_rankedencoding) {

	3303 destatep->reliable = true;

	3304 return;

	3305 }

	3306 }

	3307

	3308 // If we pruned to one encoding, we are done

	3309 if (destatep->rankedencoding_list_len == 1) {

	3310 destatep->reliable = true;

	3311 destatep->done = true;

	3312 return;

	3313 }

	3314

	3315 // If we pruned to two or three encodings in the same *superset/subset

	3316 // rankedencoding* and enough pairs, we are done. Else keep going

	3317 if (destatep->rankedencoding_list_len == 2) {

	3318 Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];

	3319 Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];

	3320 if (kMapEncToBaseEncoding[enc0] == kMapEncToBaseEncoding[enc1]) {

	3321 if (destatep->prune_count >= 3) {

	3322 destatep->reliable = true;

	3323 destatep->done = true;

	3324 return;

	3325 }

	3326 }

	3327 } else if (destatep->rankedencoding_list_len == 3) {

	3328 Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];

	3329 Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];

	3330 Encoding enc2 = kMapToEncoding[destatep->rankedencoding_list[2]];

	3331 Encoding base0 = kMapEncToBaseEncoding[enc0];

	3332 Encoding base1 = kMapEncToBaseEncoding[enc1];

	3333 Encoding base2 = kMapEncToBaseEncoding[enc2];

	3334

	3335 if ((base0 == base1) && (base0 == base2)) {

	3336 if (destatep->prune_count >= 3) {

	3337 destatep->reliable = true;

	3338 destatep->done = true;

	3339 return;

	3340 }

	3341 }

	3342 }

	3343

	3344 }

	3345

	3346

	3347 // Find current top two rankedencoding probabilities

	3348 void FindTop2(DetectEncodingState* destatep,

	3349 int* first_renc, int* second_renc,

	3350 int* first_prob, int* second_prob) {

	3351 *first_prob = -1;

	3352 *second_prob = -1;

	3353 *first_renc = 0;

	3354 *second_renc = 0;

	3355 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {

	3356 int rankedencoding = destatep->rankedencoding_list[j];

	3357 if (*first_prob < destatep->enc_prob[rankedencoding]) {

	3358 second_prob = first_prob; // old top to second

	3359 second_renc = first_renc; // old top to second

	3360 *first_prob = destatep->enc_prob[rankedencoding];

	3361 *first_renc = rankedencoding;

	3362 } else if (*second_prob < destatep->enc_prob[rankedencoding]) {

	3363 *second_prob = destatep->enc_prob[rankedencoding];

	3364 *second_renc = rankedencoding;

	3365 }

	3366 }

	3367 }

	3368

	3369

	3370 void PrintRankedEncodingList(DetectEncodingState* destatep, const char* str) {

	3371 printf("Current ranked encoding list %s\n", str);

	3372 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {

	3373 int rankedencoding = destatep->rankedencoding_list[j];

	3374 if ((rankedencoding < 0) \|\| (rankedencoding > NUM_RANKEDENCODING)) {

	3375 printf(" [%d] BOGUS rankedencoding = %d\n", j, rankedencoding);

	3376 } else {

	3377 printf(" [%d] rankedencoding = %d %-12.12s enc_prob = %d\n",

	3378 j, rankedencoding, MyRankedEncName(rankedencoding),

	3379 destatep->enc_prob[rankedencoding]);

	3380 }

	3381 }

	3382 printf("End current ranked encoding list\n\n");

	3383 }

	3384

	3385

	3386

	3387

	// Map unencoded bytes down to five bits, largely preserving letters.
	// ASCII letters fold to 1..26 regardless of case; every other byte below
	// 0x80 becomes 0. Bytes 0x80..0xFF become one of the "high" classes below.
	// This design struggles to put 33 values into 5 bits, so the high vowels
	// o and u have to share one value.
	#define XX 0     // Punctuation (00-7F range)
	#define HA 27    // High vowel a in Latin1/2/sometimes7
	#define HE 28    // High vowel e
	#define HI 29    // High vowel i
	#define HO 30    // High vowel o
	#define HU HO    // High vowel u on top of HO (same value, 30)
	#define Hc 31    // High consonant (80-FF range)
	static const char kMapToFiveBits[256] = {
	  XX,XX,XX,XX,XX,XX,XX,XX,  XX,XX,XX,XX,XX,XX,XX,XX,   // 0x00-0x0F
	  XX,XX,XX,XX,XX,XX,XX,XX,  XX,XX,XX,XX,XX,XX,XX,XX,   // 0x10-0x1F
	  XX,XX,XX,XX,XX,XX,XX,XX,  XX,XX,XX,XX,XX,XX,XX,XX,   // 0x20-0x2F
	  XX,XX,XX,XX,XX,XX,XX,XX,  XX,XX,XX,XX,XX,XX,XX,XX,   // 0x30-0x3F

	  XX, 1, 2, 3, 4, 5, 6, 7,   8, 9,10,11,12,13,14,15,   // 0x40-0x4F
	  16,17,18,19,20,21,22,23,  24,25,26,XX,XX,XX,XX,XX,   // 0x50-0x5F
	  XX, 1, 2, 3, 4, 5, 6, 7,   8, 9,10,11,12,13,14,15,   // 0x60-0x6F
	  16,17,18,19,20,21,22,23,  24,25,26,XX,XX,XX,XX,XX,   // 0x70-0x7F

	  Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc,  HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc,   // 0x80-0x8F
	  Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc,  HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc,   // 0x90-0x9F
	  Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc,  HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc,   // 0xA0-0xAF
	  Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc,  HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc,   // 0xB0-0xBF

	  Hc,HA,HA,HA,HA,Hc,Hc,Hc,  Hc,HE,HE,HE,HI,HI,HI,Hc,   // 0xC0-0xCF
	  Hc,Hc,Hc,HO,HO,HO,HO,Hc,  Hc,HU,HU,HU,HU,Hc,Hc,Hc,   // 0xD0-0xDF
	  Hc,HA,HA,HA,HA,Hc,Hc,Hc,  Hc,HE,HE,HE,HI,HI,HI,Hc,   // 0xE0-0xEF
	  Hc,Hc,Hc,HO,HO,HO,HO,Hc,  Hc,HU,HU,HU,HU,Hc,Hc,Hc,   // 0xF0-0xFF
	};
	#undef XX
	#undef HA
	#undef HE
	#undef HI
	#undef HO
	#undef HU
	#undef Hc

	// Trigram likelihood codes, one per outcome.
	// NOTE(review): presumably these are the two-bit values stored in the
	// Latin1/2/7 trigram table that follows -- confirm against its users.
	static const int kTriNoneLikely = 0;     // No encoding favored
	static const int kTriLatin1Likely = 1;   // Latin1 favored
	static const int kTriLatin2Likely = 2;   // Latin2 favored
	static const int kTriLatin7Likely = 3;   // Latin7 favored

	3431

	3432 // Each table entry has 32 times two bits, selected by byte[2]

	3433 // Entry subscript is selected by byte[0] and byte[1]

	3434 // Latin1/2/7 boost vector, generated 2007.09.26 by postproc-enc-detect-short.cc

	3435 static const uint64 kLatin127Trigrams[1024] = {

	3436 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000 0000000ULL,

	3437 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000 0000000ULL,

	3438 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000 0000000ULL,

	3439 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000 0000000ULL,

	3440 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000 0000000ULL,

	3441 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000 0000000ULL,

	3442 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000 0000000ULL,

	3443 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x000000000 0000000ULL,

	3444 0x0000000000000000ULL, 0x304080c0402c3330ULL, 0x0008400004000000ULL, 0x082800000 c200000ULL,

	3445 0x23a0000420800030ULL, 0x00000000000ccc00ULL, 0x0500100100100000ULL, 0x038840000 0200010ULL,

	3446 0x0000000000000c00ULL, 0xd0f0300740f0cf00ULL, 0x2aa0a2a22882a2acULL, 0x081d80000 0000080ULL,

	3447 0x0c82000020000000ULL, 0x200a03c000a00000ULL, 0x0008400400290000ULL, 0x040087000 0000000ULL,

	3448 0x00f040c00000c080ULL, 0x0008004000000410ULL, 0x0020300000000030ULL, 0x00a030002 c300000ULL,

	3449 0x0c8030c020a00000ULL, 0x15410030f0f4c000ULL, 0x3000000300a00000ULL, 0xa2880980a 0880a88ULL,

	3450 0x0900300000000000ULL, 0x0000040100300000ULL, 0x0888820020a00000ULL, 0xc04400224 2010000ULL,

	3451 0x000000121d300040ULL, 0x40100040440c0d54ULL, 0x00008423102f8144ULL, 0x0b4080840 0000280ULL,

	3452 0x0000000000000000ULL, 0x0680a000000c0000ULL, 0x0880008020aa0000ULL, 0x2aaa01410 10a4940ULL,

	3453 0xcb80000000010000ULL, 0x2280000000000000ULL, 0x5248000001800000ULL, 0x800040100 4040010ULL,

	3454 0x1540010201001010ULL, 0x0080080400000000ULL, 0x5a00044040000108ULL, 0x028800028 2080008ULL,

	3455 0x4800008002200000ULL, 0x4a00000000010100ULL, 0x8a88040080000800ULL, 0x014080000 0000400ULL,

	3456 0x40010050000c0000ULL, 0x0000008000000000ULL, 0x0028000020140040ULL, 0x862040140 1005308ULL,

	3457 0xc082000000000400ULL, 0x05c0b004c0240600ULL, 0x0288000080000000ULL, 0x000001400 0000000ULL,

	3458 0x00000000040000c0ULL, 0x8001861008004280ULL, 0x0200000000000300ULL, 0x000024024 2288620ULL,

	3459 0x801000c05434c200ULL, 0x9020162040a2d2b4ULL, 0x0021840000240704ULL, 0x2a8028008 0084908ULL,

	3460 0x0000000000000000ULL, 0x0500004000000040ULL, 0x0080000000040000ULL, 0x010805810 4440000ULL,

	3461 0x0900000000040000ULL, 0x00c0000000208008ULL, 0x2000005000000000ULL, 0x008000000 0050000ULL,

	3462 0x0808000000001080ULL, 0x9880810100308000ULL, 0x2285480080081a08ULL, 0x8a8000008 0080000ULL,

	3463 0x1450000000600010ULL, 0x2210000100000000ULL, 0x8a88000100011000ULL, 0x154180400 0000010ULL,

	3464 0xc084011140040100ULL, 0x0000000000000800ULL, 0x0400000000000030ULL, 0x2a800000a 0890128ULL,

	3465 0x1140a00054000104ULL, 0x1440000101200404ULL, 0x028800400400d800ULL, 0x000000000 0000000ULL,

	3466 0x0000000000002330ULL, 0x0020820228a02280ULL, 0xa2888a02aa8008a8ULL, 0xd0040a004 4202500ULL,

	3467 0x8000044104a29424ULL, 0xc000100178b2c5b4ULL, 0x0000810100241504ULL, 0xd04003000 0380008ULL,

	3468 0x0000000000000000ULL, 0x26c08c0000200130ULL, 0x4a08000110080000ULL, 0x2aa000400 1080800ULL,

	3469 0x0aac000000004000ULL, 0x2000000000200000ULL, 0x4240000100020000ULL, 0x410000008 0000000ULL,

	3470 0x4900040000000000ULL, 0x0800000400300040ULL, 0x6a80000000040800ULL, 0x2a0818200 0588008ULL,

	3471 0x0a00000c81000008ULL, 0x0a000c0010000000ULL, 0x8a88001080280808ULL, 0x002000020 0300600ULL,

	3472 0xaac00000900a0000ULL, 0x0000100004000000ULL, 0x0020081020000000ULL, 0x822010501 0084110ULL,

	3473 0x4a80800000004000ULL, 0x050000c0c0200000ULL, 0x288c000084000000ULL, 0xa04808228 0000000ULL,

	3474 0x0000000000000000ULL, 0x8000900000032080ULL, 0xee889e81b8880820ULL, 0xc2200a814 2800424ULL,

	3475 0xc020141543361010ULL, 0x10a000204a801634ULL, 0x3a808800802a00a0ULL, 0x28808b008 03d0800ULL,

	3476 0x0000000000000000ULL, 0x0020000000000030ULL, 0x0808400121010040ULL, 0x0c2824010 0200040ULL,

	3477 0x2008200028800000ULL, 0xc10004c80f30c030ULL, 0x0400440114100000ULL, 0x220820028 0a22220ULL,

	3478 0x0600000030c01000ULL, 0x1201001040c00000ULL, 0x0aa02ea22aa22aa0ULL, 0x300080000 00200a0ULL,

	3479 0x20c8400400800000ULL, 0x08280b0420800000ULL, 0x0800100000210000ULL, 0x10000300c 0100400ULL,

	3480 0xc8c0000420000000ULL, 0x1000000010000000ULL, 0x0420000400000000ULL, 0x022000050 0204000ULL,

	3481 0x2200000420000000ULL, 0x0000540400000000ULL, 0x0000000020000000ULL, 0x00080c00a 0810080ULL,

	3482 0x1540000000043000ULL, 0x0000000000100000ULL, 0x2e88a22220200a20ULL, 0xc06030e34 ea503a0ULL,

	3483 0x0001100204048500ULL, 0x000000e0000c0d54ULL, 0x3000820310a31400ULL, 0x13088c032 0e00280ULL,

	3484 0x0000000000000000ULL, 0x0480000000200000ULL, 0x4000200100000000ULL, 0x000030004 0040000ULL,

	3485 0x4400000000000000ULL, 0x0401000002240000ULL, 0x0540000000040000ULL, 0x400401000 0000000ULL,

	3486 0x4001111001100000ULL, 0x2880000000300040ULL, 0x4040004040002404ULL, 0x020000000 0000000ULL,

	3487 0x0140040000100000ULL, 0x4040010040040080ULL, 0x0a00140000041004ULL, 0x0000a0040 0808000ULL,

	3488 0x1010200000430040ULL, 0x0010000000000000ULL, 0x0540000000104000ULL, 0x140011400 5000000ULL,

	3489 0x0000204000440010ULL, 0x0500000000004400ULL, 0x4500000018000400ULL, 0x000040000 0000000ULL,

	3490 0x000000300000cc00ULL, 0x0100001011300000ULL, 0x0040000000000000ULL, 0xc0e000024 8a00444ULL,

	3491 0x0000040020340144ULL, 0x0000046445105454ULL, 0x32a0a80280880128ULL, 0x088004000 0100100ULL,

	3492 0x0000000000000000ULL, 0x14003000030c0004ULL, 0x4a04001100000000ULL, 0x0a0010801 0000000ULL,

	3493 0x28a8004000200248ULL, 0x0100040000b00000ULL, 0x42000000000008c0ULL, 0x600804401 0550010ULL,

	3494 0x0800401000010400ULL, 0x080080040cf80000ULL, 0x5080000001001010ULL, 0x2a8010000 0000000ULL,

	3495 0xcc8010010d401100ULL, 0x0200000001001000ULL, 0x0480001004001000ULL, 0x8d0080004 0b40210ULL,

	3496 0x6200800000300000ULL, 0x0000010000000000ULL, 0x0428004100010000ULL, 0x432010514 1501100ULL,

	3497 0xe28c0000000c1000ULL, 0xd5c000c3c0e00300ULL, 0x0001000000100200ULL, 0x100401020 2400008ULL,

	3498 0x0000000000003000ULL, 0x2aa038a0800aab08ULL, 0x2a88038000000000ULL, 0xc22004024 2f09720ULL,

	3499 0x8020200200ba0420ULL, 0x0020106105101004ULL, 0x0480800000220400ULL, 0x228010008 0000008ULL,

	3500 0x0000000000000000ULL, 0x9000000000200000ULL, 0x0001000000100000ULL, 0x2aa40c000 0080800ULL,

	3501 0x0040000040010000ULL, 0x0040000000c01000ULL, 0x4000000040000400ULL, 0x000000100 0200000ULL,

	3502 0x0000010000000000ULL, 0x05808004000c0000ULL, 0x50400c0000000400ULL, 0x020040008 f000040ULL,

	3503 0x0800000000100000ULL, 0x0000000000000000ULL, 0x0a08440000004000ULL, 0x006400040 0008200ULL,

	3504 0x0010010010034170ULL, 0x0000000010000000ULL, 0x0100204021000000ULL, 0x022000d00 0010100ULL,

	3505 0x0840300000c00000ULL, 0x1400000040204400ULL, 0x09800c0040000000ULL, 0x020970800 0000000ULL,

	3506 0x000000000000c040ULL, 0x90000c50204040a0ULL, 0x0000000000000000ULL, 0x00e150004 0200004ULL,

	3507 0x8020260540204494ULL, 0x0020026150201054ULL, 0x0281800380105634ULL, 0x088490048 1105000ULL,

	3508 0x0000000000000000ULL, 0x84203c00002c0200ULL, 0xc089040000000000ULL, 0xc2a810004 0200004ULL,

	3509 0xe00c1c0000000000ULL, 0x0ce1330080200080ULL, 0x0000000000200000ULL, 0xc40011000 0404010ULL,

	3510 0x0088400000000000ULL, 0x00083cc00c00c00cULL, 0xcac01c00c000580cULL, 0xe300b0f00 0100000ULL,

	3511 0x0300000000000000ULL, 0xc0000f0000000000ULL, 0xc3c01c0400000000ULL, 0x81008004c 0f40000ULL,

	3512 0xc3d8003000000440ULL, 0x0000000000000000ULL, 0xc430000000000000ULL, 0x006000000 0001000ULL,

	3513 0x0800000000000000ULL, 0x00c03300f0fc0008ULL, 0x3000000400200010ULL, 0xa2a80892a 0880a28ULL,

	3514 0x0500000040000004ULL, 0x0000000000000000ULL, 0xc80032070c200020ULL, 0x022082006 0a296a0ULL,

	3515 0x802084021db486a0ULL, 0x00000d60080c0080ULL, 0xb281803313a32428ULL, 0x180830032 0300000ULL,

	3516 0x0000000000000000ULL, 0x85208cc0ccac1f20ULL, 0x2081000186100808ULL, 0x22a808800 00a0808ULL,

	3517 0xaaa8086880000000ULL, 0x802084800a2e9200ULL, 0xa280000000002008ULL, 0xa00000008 0080400ULL,

	3518 0x2080010000000008ULL, 0x802020c00c028c80ULL, 0x2080000000140810ULL, 0x2a8008608 0080008ULL,

	3519 0x2a800000a8000800ULL, 0xaa881800a2080800ULL, 0xaa98004080280808ULL, 0x004483d0c 0300000ULL,

	3520 0xa280002080080000ULL, 0x0000000000300000ULL, 0x22a1030000000008ULL, 0xa8a030108 8880880ULL,

	3521 0xaa80002080222808ULL, 0x85400c03fc030400ULL, 0x8a88000000000008ULL, 0xa00800801 0080008ULL,

	3522 0x0000000000010000ULL, 0x0040100000301040ULL, 0x28800000a0002008ULL, 0x122482306 cbc0eacULL,

	3523 0x8020224222b8c6a0ULL, 0x802002004a82c284ULL, 0x0aa08fc440a41c80ULL, 0x888080d18 1385098ULL,

	3524 0x0000000000000000ULL, 0x00c0b000000c0080ULL, 0x2208001000000800ULL, 0x0a2800000 0200000ULL,

	3525 0x0000000300000000ULL, 0x00c1040000200000ULL, 0x0203020000000000ULL, 0x024800000 0020000ULL,

	3526 0x0000840000100000ULL, 0x0a808c00c000008cULL, 0x5200040040000004ULL, 0x02000c000 00080a0ULL,

	3527 0x0b0c000020000000ULL, 0x0b04000001000000ULL, 0x088c0010002000c0ULL, 0x80e08b00c 0030c20ULL,

	3528 0x0280000200014040ULL, 0x0000000000000000ULL, 0x0e20a0a008000020ULL, 0x0e280fd03 f00111cULL,

	3529 0x200080c020001000ULL, 0x8cc00c02c02f0400ULL, 0x480c0001000c404cULL, 0x020801428 1080808ULL,

	3530 0x000000000000fcfcULL, 0x004403300cf00030ULL, 0x2200000000004400ULL, 0x02202000c 08c0c20ULL,

	3531 0x02202022683a80a0ULL, 0x4020228028008c00ULL, 0x32208cc0002c0200ULL, 0x3ec00c008 0304008ULL,

	3532 0x0000000000000000ULL, 0x34000c00002c0000ULL, 0x0b00000100100030ULL, 0x082301800 0000000ULL,

	3533 0x0e8c001c01e00000ULL, 0x1200800600330000ULL, 0x4000110000000000ULL, 0x008000030 0000000ULL,

	3534 0x0800000000000000ULL, 0x08c08c04000c0000ULL, 0x0080400000880000ULL, 0x0a0800008 0c00008ULL,

	3535 0x0800000304400000ULL, 0x0208000000c00000ULL, 0x2888300080400800ULL, 0x8dc020440 0000000ULL,

	3536 0xc0000000c0800000ULL, 0x0000c10000000000ULL, 0x24000c4010c00000ULL, 0x272000541 d811000ULL,

	3537 0x0200400000001000ULL, 0x0400000400001004ULL, 0xc08c007004001000ULL, 0x204800400 0000000ULL,

	3538 0x000000000003fcfcULL, 0x2aa030000cf8c800ULL, 0xe280000000000000ULL, 0x0a2100814 2000340ULL,

	3539 0x0021002000b61040ULL, 0x800004064006d444ULL, 0x3aa0800300230008ULL, 0x0b0003000 0300000ULL,

	3540 0x0000000000000000ULL, 0x01c080000000040cULL, 0x0100000000004000ULL, 0x0aa801801 0001000ULL,

	3541 0x0800000000100000ULL, 0x3000000000008c00ULL, 0x5400000013000000ULL, 0x02c0c0000 4004010ULL,

	3542 0x5241100010000c00ULL, 0x0e00080000000808ULL, 0x5281000000000800ULL, 0x0a0810802 0000800ULL,

	3543 0x0a80000000005210ULL, 0x0100000041000000ULL, 0x2a88000002080110ULL, 0x852080000 0c00080ULL,

	3544 0x01000010108c0100ULL, 0x0000000000000000ULL, 0x42a0420080000000ULL, 0x002000100 4010010ULL,

	3545 0xc4000000000c0000ULL, 0x01000c00c0200400ULL, 0x4600000100000000ULL, 0x000000000 0000000ULL,

	3546 0x0010001000000010ULL, 0x910400900820d030ULL, 0x2280000000000000ULL, 0xc22120044 00040e4ULL,

	3547 0x8001000000b61420ULL, 0xa00002a248e810b4ULL, 0x32008000002c0008ULL, 0x0c0100348 03c5010ULL,

	3548 0x0000000000000000ULL, 0x85008002002c0000ULL, 0x0204001000004010ULL, 0x012000800 0200000ULL,

	3549 0x000010000c2000c0ULL, 0xccc0000000200000ULL, 0x0400000c00100040ULL, 0x000330010 0004100ULL,

	3550 0x4000551040000004ULL, 0x0e0080000c820808ULL, 0xc000000000080800ULL, 0xc80300000 0000000ULL,

	3551 0x0a4000c000200000ULL, 0x0040000000c00000ULL, 0x0918145000405000ULL, 0x81400000c 0300400ULL,

	3552 0x0050000000000000ULL, 0xd000045000000000ULL, 0x0400004000400000ULL, 0x042010401 0000110ULL,

	3553 0x0700000000203000ULL, 0x34800300c0e00704ULL, 0x4440100044000400ULL, 0x004000004 0000000ULL,

	3554 0x0030000044000000ULL, 0xeaaca0008808c880ULL, 0x0a01000000200000ULL, 0x1220a3004 03ccf20ULL,

	3555 0x002024c200b61044ULL, 0x802014346aa2d434ULL, 0x30008c00c0820c44ULL, 0x0a0000000 00c4800ULL,

	3556 0x0000000000000000ULL, 0x0000404000340c90ULL, 0x08a8a10820800280ULL, 0x812800902 2201000ULL,

	3557 0x0020808228a000a0ULL, 0x0020400100410000ULL, 0x0400000110000000ULL, 0xa60900000 0200000ULL,

	3558 0x8008330000d00000ULL, 0x8060100040404010ULL, 0xeaa00ea0ea00808cULL, 0x200c8020a 0000020ULL,

	3559 0x0408800020200000ULL, 0x0189001403200000ULL, 0xc00800000000c000ULL, 0x200430c00 c300000ULL,

	3560 0x0100300100004000ULL, 0x0000040000000000ULL, 0x2420000400001000ULL, 0x89a120040 0000000ULL,

	3561 0x20c8a000208c0000ULL, 0x8080000000000000ULL, 0x28a0108020210080ULL, 0xa2a84800a 0880988ULL,

	3562 0x258008000400c000ULL, 0x0140000000100000ULL, 0xa028a222a0aa0228ULL, 0xc06001205 4044040ULL,

	3563 0x0010010400000000ULL, 0x00000050150c0114ULL, 0x0000008010c20010ULL, 0xaa088000a 0200880ULL,

	3564 0x0000000000000000ULL, 0x0700b0c0000c0000ULL, 0x2200040000080030ULL, 0x2aa880804 0240800ULL,

	3565 0x08b0500000000100ULL, 0x1000830400200000ULL, 0x4204000010000000ULL, 0x40c220005 0040050ULL,

	3566 0x0104404001010000ULL, 0x1a808c8103c00030ULL, 0x30900010c0000b00ULL, 0x200812b28 3000008ULL,

	3567 0x000c000020e00000ULL, 0x2140000000400000ULL, 0x0288000080200000ULL, 0x8060a200c 8a20280ULL,

	3568 0x0400114010215000ULL, 0x0000000000000000ULL, 0x082b200002000010ULL, 0x22a003000 0031000ULL,

	3569 0x008100001000000cULL, 0x05400c00c0230400ULL, 0xca3000003c080100ULL, 0x000000002 0000004ULL,

	3570 0x0000000100000000ULL, 0x8004320813f5c000ULL, 0xa280080200000800ULL, 0xc22000044 e334c20ULL,

	3571 0x000004146e361024ULL, 0x800126806aa0d584ULL, 0xb000a0040023c41cULL, 0x0a0830008 03053d8ULL,

	3572 0x0000000000000000ULL, 0x0000100000020000ULL, 0x0000000010000010ULL, 0x000000004 5040004ULL,

	3573 0x0000000000100000ULL, 0x0000020400000010ULL, 0x0003015000000000ULL, 0x040000000 0000000ULL,

	3574 0x0000000400000000ULL, 0x0100000000000800ULL, 0x0000001000000000ULL, 0x000000000 0000000ULL,

	3575 0x0000000040000000ULL, 0x0000000000000000ULL, 0x0004001000000000ULL, 0x000800100 0000000ULL,

	3576 0x0010000000000004ULL, 0x0000010100001000ULL, 0x0004000000000004ULL, 0x000001404 0050014ULL,

	3577 0x0014000000000040ULL, 0x5540000000041000ULL, 0x0000000000000000ULL, 0x000004000 0000d00ULL,

	3578 0x0000000000000000ULL, 0x0000000000100000ULL, 0x0001000000000000ULL, 0x000000000 0000000ULL,

	3579 0x0000000000000000ULL, 0x0000000000000000ULL, 0x4500000000040400ULL, 0x000080000 0000400ULL,

	3580 0x0000000000000000ULL, 0x13e080000020000cULL, 0xcf00001005100000ULL, 0x04a800800 0200300ULL,

	3581 0x00280100100000c0ULL, 0x1c8c000040200000ULL, 0x0600005000100000ULL, 0x050800000 c104000ULL,

	3582 0x4c10101000110000ULL, 0x0c00000000300000ULL, 0x22040c00100000c0ULL, 0x080070001 0100000ULL,

	3583 0x0000000000001000ULL, 0x0a08000010000040ULL, 0x0800034004210010ULL, 0x04e000040 0000000ULL,

	3584 0x0800030020000000ULL, 0x0000005000000000ULL, 0x0400110101304110ULL, 0x042800001 0a01000ULL,

	3585 0x060b000000800010ULL, 0x35810c00c020c000ULL, 0x00800c4321800000ULL, 0x420808802 0000080ULL,

	3586 0x040000111003ff00ULL, 0x0020900020202080ULL, 0x22888180a8000888ULL, 0x022520054 2005420ULL,

	3587 0x2020040400340020ULL, 0x10300424500cc444ULL, 0x3081a00400e00200ULL, 0x33001300c 0300000ULL,

	3588 0x0000000000000000ULL, 0x04003c0000000000ULL, 0x0a04001000100100ULL, 0x140800000 1000000ULL,

	3589 0x1800000044100000ULL, 0x3400040400000300ULL, 0x5000040801000040ULL, 0x408840104 0000040ULL,

	3590 0x1010110130100000ULL, 0xca800c3000300000ULL, 0x5a01000000080100ULL, 0x020280000 cd01300ULL,

	3591 0x0302000410200010ULL, 0x0000102000300000ULL, 0x0b09000000000000ULL, 0x20008004c 4800004ULL,

	3592 0x28c0410010000000ULL, 0x0004015041000050ULL, 0x0a01006000200200ULL, 0x0020d0000 0100040ULL,

	3593 0x0010a00100900000ULL, 0x3500bf00c0030300ULL, 0x080c010000200d00ULL, 0x224800000 4020010ULL,

	3594 0x0000c00000000000ULL, 0x8044b00200e08000ULL, 0xaaa82aa2aa8a2aa8ULL, 0x022000224 1c08604ULL,

	3595 0x4200260440328444ULL, 0x68001226103008b4ULL, 0x3a0080c0b0000400ULL, 0x2a8048048 03c4008ULL,

	3596 0x0000000000000000ULL, 0x04008c0300000400ULL, 0x008000c0000c0000ULL, 0x088001000 000001cULL,

	3597 0x0840000001000010ULL, 0x0400000000200c00ULL, 0x4244000101040000ULL, 0x423800701 1100000ULL,

	3598 0x1000d00100000010ULL, 0x1d00800400300000ULL, 0x4204080c00000000ULL, 0x2a8808008 0000008ULL,

	3599 0x08001c0200001000ULL, 0x0a00000400000000ULL, 0x8a88003080080000ULL, 0x052180040 0300000ULL,

	3600 0x3200051000201000ULL, 0x0000000000000000ULL, 0x0020801404000000ULL, 0x322010401 c0c101cULL,

	3601 0x0c01100013000000ULL, 0x04003000c0204000ULL, 0x088c0020a0cc0000ULL, 0x220000008 0000018ULL,

	3602 0x0404000044000000ULL, 0x82a0b000008820b0ULL, 0x0000040020440000ULL, 0xc26500044 03f1420ULL,

	3603 0x0021340241b64464ULL, 0x8020040242c2d474ULL, 0x32018c0480288000ULL, 0x00800b008 0300000ULL,

	3604 0x0000000000000000ULL, 0x05008c0000040130ULL, 0xc0d8000000800000ULL, 0x002000002 0200200ULL,

	3605 0x23a2000120204000ULL, 0x5052100550104150ULL, 0x1000101100040000ULL, 0xc40001c30 1000000ULL,

	3606 0x8288000000c00000ULL, 0x5150040144d01404ULL, 0xea8c0ea028ae088cULL, 0xc31010c00 0000c80ULL,

	3607 0x0002000060000000ULL, 0xc80800f030000000ULL, 0x0000000400300000ULL, 0xc00080c00 ff0c344ULL,

	3608 0x00080001200c0000ULL, 0x0000050080000000ULL, 0x0328000300300000ULL, 0x082030000 cc01040ULL,

	3609 0xeb08800100004000ULL, 0x8030003300c80f00ULL, 0xfb0d0000e4ac0000ULL, 0x002000608 0000008ULL,

	3610 0x0500100100040000ULL, 0x1140000000000000ULL, 0xcb883330a0e00000ULL, 0xc00001005 0000080ULL,

	3611 0x0010104005b54150ULL, 0x40111d5155001554ULL, 0x80000070140f0004ULL, 0x0b0830c3a 0003380ULL,

	3612 0x0000000000000000ULL, 0x04c13000000f830cULL, 0x2808000000000000ULL, 0x281000000 0000800ULL,

	3613 0x08c0080004400000ULL, 0x04c0240300801c20ULL, 0x4040000080000004ULL, 0x000040010 0100010ULL,

	3614 0x020001008000c0c0ULL, 0x1d008c000c3c0000ULL, 0x0080003000000800ULL, 0x228808008 0000008ULL,

	3615 0x0a84004020220000ULL, 0x0800080000100000ULL, 0xaa80004080400008ULL, 0x802400040 0c01660ULL,

	3616 0x80841c2001000104ULL, 0x0001000000000000ULL, 0x0020028020020280ULL, 0x086040401 1900100ULL,

	3617 0xec80080200000000ULL, 0x010103c100200400ULL, 0x0200004000000000ULL, 0x000000000 0400400ULL,

	3618 0x000010000003fcfcULL, 0x8040083238c20000ULL, 0x08800220a0920a00ULL, 0x082100044 83c0c24ULL,

	3619 0xc020240740b0a200ULL, 0x802006014a201494ULL, 0x3201233070ac0e00ULL, 0x080028060 33a48a0ULL,

	3620 0x0000000000000000ULL, 0x8020820028a00680ULL, 0x2000002000000104ULL, 0x22a808011 00a0808ULL,

	3621 0xa2a8002080000000ULL, 0xa000800008a08000ULL, 0x0000100000400000ULL, 0x800000210 0000000ULL,

	3622 0x0000010000004404ULL, 0xa2a0088080000888ULL, 0x0000000010400800ULL, 0xa28008208 0080008ULL,

	3623 0x2280000080010008ULL, 0x2000000000000000ULL, 0x228800008c080808ULL, 0x802182800 2a98200ULL,

	3624 0xa200002000080000ULL, 0x0000040000000000ULL, 0x22a0000080000000ULL, 0x202882c20 0800080ULL,

	3625 0xa000000001004000ULL, 0x000000c808a00600ULL, 0x0000000010000000ULL, 0x000001000 000040cULL,

	3626 0x0000000000000000ULL, 0x802002a2a8aa82a0ULL, 0x20000024a8088228ULL, 0x802082000 1000000ULL,

	3627 0x8020000000808280ULL, 0x8000000000000000ULL, 0x0020800000200280ULL, 0x208008228 0a00888ULL,

	3628 0x0000000000000000ULL, 0x0000015000000040ULL, 0x0000040000040000ULL, 0x010001001 0001000ULL,

	3629 0x0000003210008000ULL, 0x0000000404000000ULL, 0x0000000000000400ULL, 0x020000000 0000000ULL,

	3630 0x0000000000000100ULL, 0x5180014400004050ULL, 0x1000000014000000ULL, 0x420000000 0000000ULL,

	3631 0x0040200000000000ULL, 0x0201004000000000ULL, 0x0a00000000000010ULL, 0x004020000 0800000ULL,

	3632 0x0040051000000500ULL, 0x0000000100800400ULL, 0x6000000000000000ULL, 0x000000000 0000000ULL,

	3633 0x280000c1400040ccULL, 0x4180001000000000ULL, 0x00000000c1000104ULL, 0x000000000 0000000ULL,

	3634 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0080000000c00000ULL, 0x000400606 6004000ULL,

	3635 0x0000005000040440ULL, 0x0000106005804044ULL, 0x0000a10511004440ULL, 0x000000000 0000110ULL,

	3636 0x0000000000000000ULL, 0x0000000000080000ULL, 0xeb0808a020800080ULL, 0x29a800810 02a1800ULL,

	3637 0x0b2c000202100100ULL, 0x0001000000888000ULL, 0x2280102010000000ULL, 0x020000602 a004110ULL,

	3638 0x8a800160a6108100ULL, 0x0280000000000020ULL, 0x8a8000a0a8808208ULL, 0x028088208 0500308ULL,

	3639 0x0b18010020804100ULL, 0xeb080000c0080080ULL, 0x2b08000000810130ULL, 0x000000000 8040020ULL,

	3640 0xaa0a08e082894140ULL, 0x0000000000000000ULL, 0x202081409010001cULL, 0x8aa880508 2806000ULL,

	3641 0xeb082900289c0000ULL, 0x0000000000008000ULL, 0xf80c2e20002e0000ULL, 0xa28808042 0880888ULL,

	3642 0x0000010000000000ULL, 0x0000000000102000ULL, 0x22880000a8a80808ULL, 0x022022a22 aa880a0ULL,

	3643 0x0000222222aa0620ULL, 0x0000022002800000ULL, 0x208080004028a000ULL, 0x2b8888008 01c0828ULL,

	3644 0x0000000000000000ULL, 0x22e0828280a08028ULL, 0xaa88002082080308ULL, 0x0ea800804 10a0040ULL,

	3645 0x2a28222000a00000ULL, 0x8aa2808028a0a2a0ULL, 0x0200001000000000ULL, 0x82080000a 0000000ULL,

	3646 0x8800000082000808ULL, 0x2a008a0000300888ULL, 0x0a80080080080808ULL, 0xaa8828008 40b0808ULL,

	3647 0x0a80000080000040ULL, 0xea080820a0000000ULL, 0xaa88080080080808ULL, 0x8040a2800 a8024a0ULL,

	3648 0xaa800020a0080808ULL, 0x0000040000000000ULL, 0x2a280a0080080880ULL, 0x2a2008108 0008a00ULL,

	3649 0x2a88882088aa0008ULL, 0x81800202c0a01480ULL, 0xea88082082200000ULL, 0xaa8800208 0080008ULL,

	3650 0x0000100000000000ULL, 0x802082a22aa0a2a0ULL, 0x2e80000000000000ULL, 0x0220a2a26 aa0a2a8ULL,

	3651 0x800022a2228a22a0ULL, 0x880002212e82c0b0ULL, 0x02a0aa0002a82228ULL, 0x2d808b008 0380008ULL,

	3652 0x0000000000000000ULL, 0x000407551c154244ULL, 0x2a00208088a02228ULL, 0x12a82182a 2402a88ULL,

	3653 0xe32821e020826d00ULL, 0x801130100ccc1330ULL, 0x028010c000841008ULL, 0x88a08002a 0a664a0ULL,

	3654 0x0048270080000100ULL, 0x00001f010cd10f30ULL, 0xe2242ce22aaea2a0ULL, 0xc2c00cc20 ae22460ULL,

	3655 0xe208003128021c10ULL, 0x2a2021c010821080ULL, 0x2a88202082202020ULL, 0x401011110 4941410ULL,

	3656 0xc80c02c182b00080ULL, 0x0000040000000000ULL, 0xe28030068002c300ULL, 0x2aa02024a 2a22228ULL,

	3657 0xe20889328aa22080ULL, 0x0000000000210100ULL, 0xaa0028e0a9b221a0ULL, 0x200000808 0400000ULL,

	3658 0x0000010041150404ULL, 0x0000105114410100ULL, 0xeaa82aa6aaaaaaa8ULL, 0x000000f44 300c434ULL,

	3659 0x0000222222b00020ULL, 0x0000002000000000ULL, 0x0000004014000000ULL, 0x0039b3f73 fbcd3fcULL,

	3660 0x0000000000000000ULL, 0x0000104015045040ULL, 0x20a80490a08800a0ULL, 0x40a825841 0a909a0ULL,

	3661 0xe0a8a2022aa2e2a0ULL, 0xc111010014000500ULL, 0x2080044041840004ULL, 0x28a820022 0a2aba0ULL,

	3662 0x008400a0a2840800ULL, 0x0101015451009464ULL, 0x20000ea0e02c2c2cULL, 0xe2a828a2a ca2aaa8ULL,

	3663 0x682020a228a222a0ULL, 0xe8882ae22aa2a2a0ULL, 0xe9a80e6022a24140ULL, 0x001105500 5001040ULL,

	3664 0x2aa8208229a0aaa4ULL, 0x0000040000000000ULL, 0x28a0228026a62260ULL, 0xe2a020a42 2a2a020ULL,

	3665 0xe808a0022aa1a220ULL, 0x0000010014000100ULL, 0x28ac22802aa2a020ULL, 0x002000000 0000000ULL,

	3666 0x0100010100040000ULL, 0x0000000000000000ULL, 0x22a822a22a8aaaa0ULL, 0x000000000 0000000ULL,

	3667 0x0000102410800100ULL, 0x0000000000000000ULL, 0x0000000002000000ULL, 0x00000fb2a 08c0aa8ULL,

	3668 0x0000000000000000ULL, 0x4010005015440140ULL, 0x18c81c00b180001cULL, 0x280004802 1820800ULL,

	3669 0x8ab820c06a802580ULL, 0x00100170f4040000ULL, 0x4000144041041404ULL, 0x0ac800d00 02e440cULL,

	3670 0x20880820a2000808ULL, 0x400000f03f300c00ULL, 0xaa000ea22aa22aa0ULL, 0xa2880ac0a 8942a20ULL,

	3671 0xaa880a81a1804188ULL, 0xeea022a0aaa02080ULL, 0xaaa820a2aaa66120ULL, 0x000000511 5800150ULL,

	3672 0x2a880920a0840040ULL, 0x0000040000000000ULL, 0xaea82222aaa22a28ULL, 0x8a2804126 0055150ULL,

	3673 0xa28824008aa28880ULL, 0x0000025014019000ULL, 0xea882ae02aa200a0ULL, 0x000000000 0000000ULL,

	3674 0x0000000040000400ULL, 0x0000000000000000ULL, 0xaaa82aa22aaaaaa0ULL, 0x000000000 0000000ULL,

	3675 0x0000000000000000ULL, 0x002003003c80c000ULL, 0x0000020014000000ULL, 0x00200010a 0980a20ULL,

	3676 0x0000000000000000ULL, 0x0020001200801240ULL, 0x0a88000089800020ULL, 0xcaa00080a 1000000ULL,

	3677 0x0a200c0020a04080ULL, 0x4002034003840880ULL, 0x4690500190000050ULL, 0x222800400 0601000ULL,

	3678 0x0a803f00803f400cULL, 0x400033e24dd0cf34ULL, 0xaa80a2a229a220a0ULL, 0x0a2240000 02c0000ULL,

	3679 0x028000202000008cULL, 0x0a08000070000030ULL, 0x00800c040020000cULL, 0x000000000 2850000ULL,

	3680 0x02881cc310200000ULL, 0x0000040004000000ULL, 0xcba8000400000080ULL, 0xcaa02c068 0000000ULL,

	3681 0xcc880002008c4080ULL, 0x300000f007f0cf0cULL, 0x0a80001080a00000ULL, 0x820880802 a880a80ULL,

	3682 0x0000050001040004ULL, 0x0000011000000000ULL, 0x0a8020a2a0202000ULL, 0x000002220 2008000ULL,

	3683 0x0000222212808000ULL, 0x0020226010000000ULL, 0x000033f33ff3c33cULL, 0x00288002a 08c02a8ULL,

	3684 0x0000000000000000ULL, 0x04408e0000008200ULL, 0x0808004000900000ULL, 0x0aa820001 0ca00c0ULL,

	3685 0x0ba80101005d4010ULL, 0x00018604802c8288ULL, 0x00049400101c0000ULL, 0x000c10111 0505010ULL,

	3686 0x0000000000100000ULL, 0x30000c00c022000cULL, 0xd0c00dd0d51d431cULL, 0x000800001 0100000ULL,

	3687 0x000c1001a0280000ULL, 0x0bc80000c0000000ULL, 0x0a00000080280000ULL, 0x8000a0022 0308420ULL,

	3688 0x0808000010301000ULL, 0x0000040000000000ULL, 0x0d00031480100000ULL, 0x072000001 08c0300ULL,

	3689 0x0bc0a0c000004000ULL, 0x8000b002c0208480ULL, 0x340c0100118c111cULL, 0x800800802 0890000ULL,

	3690 0x0000000000040010ULL, 0x0020b00320c1d0b0ULL, 0x00002000000c0000ULL, 0x0020be226 e2008a0ULL,

	3691 0x002010c03fb0a6a0ULL, 0x00202e222aaec284ULL, 0x00008f0000208400ULL, 0x000000000 0300000ULL,

	3692 };

	3693 // Latin1 6%, Latin2 11%, Latin7 3%

	3694

	3695

	3696

	// Debug-only helper: renders a 15-bit trigram subscript as three characters.
	// Each five-bit field (bits 14-10, 9-5, 4-0) selects one character of the
	// 32-character alphabet below. The text is written into one file-level
	// buffer that every call overwrites, so this is not thread-safe and the
	// returned pointer is only meaningful until the next call.
	static char tri_string[4];
	char* Latin127Str(int trisub) {
	  static const char kFiveBitAlphabet[] = "_abcdefghijklmnopqrstuvwxyzAEIOC";
	  char* out = tri_string;
	  // Walk the three fields from most significant to least significant.
	  for (int shift = 10; shift >= 0; shift -= 5) {
	    *out++ = kFiveBitAlphabet[(trisub >> shift) & 0x1f];
	  }
	  *out = '\0';
	  return tri_string;
	}

	3706

	3707 // Returns two bits per three-byte trigram, indicating

	3708 // dont-care, Latin1 likely, Latin2 likely, and Latin7 (ISO-8859-13) likely

	3709 int TrigramValue(const uint8* trisrc) {

	3710 int byte0_p = kMapToFiveBits[trisrc[0]];

	3711 int byte1_p = kMapToFiveBits[trisrc[1]];

	3712 int byte2_p = kMapToFiveBits[trisrc[2]];

	3713 int subscr = ((byte0_p) << 5) \| byte1_p;

	3714 int temp = static_cast<int>((kLatin127Trigrams[subscr] >> (byte2_p * 2)));

	3715 //printf("%s=%d ", Latin127Str((subscr << 5) \| byte2_p), temp & 3);

	3716 return temp & 3;

	3717 }

	3718

	3719

	3720 // Put out trigrams for surrounding 32 bytes for Latin encodings

	3721 // Return true if more Latin2 & 7 than Latin1

	3722 bool BoostLatin127Trigrams(int tri_block_offset,

	3723 DetectEncodingState* destatep) {

	3724 //printf("BoostLatin127Trigrams[%06x]\n", tri_block_offset);

	3725 int excess_latin27 = 0;

	3726 int srclen = destatep->limit_src - destatep->initial_src;

	3727 int hi_limit = minint(tri_block_offset + 32, srclen - 2);

	3728 const uint8* trisrc = &destatep->initial_src[tri_block_offset];

	3729 const uint8* trisrclimit = &destatep->initial_src[hi_limit];

	3730 while (trisrc < trisrclimit) {

	3731 // Selectively boost Latin1, Latin2, or Latin7 and friends

	3732 int trigram_val = TrigramValue(trisrc);

	3733 if (trigram_val != 0) {

	3734 if (FLAGS_enc_detect_source) {

	3735 PsHighlight(trisrc, destatep->initial_src, trigram_val, 1);

	3736 }

	3737 if (trigram_val == kTriLatin1Likely) {

	3738 Boost(destatep, F_Latin1, kTrigramBoost);

	3739 Boost(destatep, F_CP1252, kTrigramBoost);

	3740 // We don't want to upset the relative rank of a declared 8859-15

	3741 Boost(destatep, F_ISO_8859_15, kTrigramBoost);

	3742 --excess_latin27;

	3743 } else if (trigram_val == kTriLatin2Likely) {

	3744 Boost(destatep, F_Latin2, kTrigramBoost);

	3745 Boost(destatep, F_CP1250, kTrigramBoost);

	3746 ++excess_latin27;

	3747 } else if (trigram_val == kTriLatin7Likely) {

	3748 Boost(destatep, F_ISO_8859_13, kTrigramBoost);

	3749 Boost(destatep, F_CP1257, kTrigramBoost);

	3750 // We don't want to upset the relative rank of a declared 8859-4 or -6

	3751 // for Estonian

	3752 Boost(destatep, F_Latin4, kTrigramBoost);

	3753 Boost(destatep, F_Latin6, kTrigramBoost);

	3754 ++excess_latin27;

	3755 }

	3756 }

	3757

	3758 ++trisrc;

	3759 }

	3760 //printf("\n");

	3761

	3762 return (0 < excess_latin27);

	3763 }

	3764

	3765

	3766

	3767 // Boost any encodings that need extra detection help, then prune

	3768 // src is first unscanned byte

	3769 // prunereason == PRUNE_SLOWEND means extra pruning when dropping out of initial slow scan

	3770 // prunereason == PRUNE_FINAL means last call -- no bigram at src
	//
	// Overview of the phases below, in order:
	//   1. On the final call, derate over-strong initial hints if few bigrams seen.
	//   2. INCREMENT: score every OtherPair bigram recorded since the previous
	//      call (prior_interesting_pair .. next_interesting_pair) against each
	//      encoding still in rankedencoding_list.
	//   3. BOOST: run ActiveSpecialBoostWhack if anything is active.
	//   4. On the final call, apply pre-prune adjustments (UTF-8, BINARY, ISO-2022).
	//   5. PRUNE: ReRank, then drop encodings scoring below top_prob - prune_diff.
	//   6. On the final call, apply post-prune adjustments (force ASCII-7, swap
	//      ISO-8859-x <-> CP125x depending on whether 0x80-0x9F lead bytes occurred).
	//   7. FINISH: set destatep->reliable and possibly destatep->done.

	3771 void BoostPrune(const uint8* src, DetectEncodingState* destatep,

	3772 int prunereason) {
	// Number of pairs of each kind recorded since the previous call

	3773 int delta_asciipairs = destatep->next_interesting_pair[AsciiPair] -

	3774 destatep->prior_interesting_pair[AsciiPair];

	3775 int delta_otherpairs = destatep->next_interesting_pair[OtherPair] -

	3776 destatep->prior_interesting_pair[OtherPair];

	3777

	3778 if (prunereason == PRUNE_FINAL) {

	3779 // We are about done

	3780 // If we get here with very little accumulated data, the initial hints

	3781 // were too strong, so we derate them to n+1 / 12 for n bigrams

	3782 if (!destatep->hints_derated &&

	3783 (destatep->next_interesting_pair[OtherPair] < kDerateHintsBelow)) {

	3784 int n = destatep->next_interesting_pair[OtherPair];

	3785

	3786 // Map N pairs to (N+1)/12 portions of the initial hints, etc.

	3787 // Floor of 3/12 -- 1/12 and 2/12 are too easy to overcome

	3788 int m = maxint(3, (n + 1));

	3789 for (int i = 0; i < NUM_RANKEDENCODING; ++i) {
	// Replace the full hint contribution with its m/kDerateHintsBelow share

	3790 int original_delta = destatep->hint_prob[i];

	3791 int scaled_delta = (original_delta * m) / kDerateHintsBelow;

	3792 destatep->enc_prob[i] -= original_delta;

	3793 destatep->enc_prob[i] += scaled_delta;

	3794 }

	3795 destatep->hints_derated = true;

	3796 if (destatep->debug_data != NULL) {

	3797 // Show derated-hint result

	3798 char buff[32];

	3799 snprintf(buff, sizeof(buff), "Hints %d/%d", m, kDerateHintsBelow);

	3800 SetDetailsEncLabel(destatep, buff);

	3801 }

	3802 }

	3803 }

	3804

	3805

	3806 ++destatep->prune_count;

	3807

	3808 if (prunereason != PRUNE_FINAL) {

	3809 // Early outs

	3810 if (destatep->rankedencoding_list_len <= 1) { // nothing to prune

	3811 destatep->done = true;

	3812 return;

	3813 }

	3814
	// NOTE(review): prune_count was incremented just above, so the first
	// test looks always true unless prune_count can start negative -- confirm.

	3815 if ((destatep->prune_count > 0) &&

	3816 (delta_asciipairs + delta_otherpairs) == 0) {

	3817 // Nothing to do; must have just been called earlier

	3818 return;

	3819 }

	3820 }

	3821

	3822

	3823

	3824 // INCREMENT

	3825 // ====================

	3826 // Accumulate OtherPair probabilities over all active families

	3827 // AsciiPair probabilities are all done in ActiveSpecialBoostWhack

	3828 uint8 prior_bad_byte1 = ' '; // won't match first bad pair

	3829 uint8 prior_bad_byte2 = ' '; // won't match first bad pair

	3830 uint8 or_byte1 = 0; // Track if any current pair has a high bit

	3831 int counted_otherpairs = 0;

	3832 uint8 prior_byte1x2x = 0;

	3833 for (int i = 0; i < delta_otherpairs; ++i) {

	3834 int watch1_incr = 0;

	3835 int watch2_incr = 0;

	3836 int next_pair = destatep->prior_interesting_pair[OtherPair] + i;

	3837
	// Each recorded pair occupies two consecutive bytes of interesting_pairs

	3838 uint8 byte1 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 0];

	3839 uint8 byte2 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 1];
	// High nibble of byte1 in the top four bits, high nibble of byte2 below it

	3840 uint8 byte1x2x = (byte1 & 0xf0) \| ((byte2 >> 4) & 0x0f);
	// Every increment/boost/whack for this pair is right-shifted by weightshift

	3841 int weightshift = destatep->interesting_weightshift[OtherPair][next_pair];

	3842

	3843 int offset_byte12 = destatep->interesting_offsets[OtherPair][next_pair];

	3844

	3845 // To help distinguish some Cyrillic, Arabic, Greek, Hebrew, Thai

	3846 // Remember if this is a CDEF pair immediately following the previous pair

	3847 // 8xxx CxCx or CxCx 8xxx

	3848 bool next_pair_consec_hi = false;

	3849 if (ConsecutivePair(destatep, next_pair)) {

	3850 if ((byte1x2x & 0xcc) == 0xcc) { // 8xxx CxCx

	3851 next_pair_consec_hi = true;

	3852 } else if ((prior_byte1x2x & 0xcc) == 0xcc) { // CxCx 8xxx

	3853 next_pair_consec_hi = true;

	3854 }

	3855 }

	3856 //printf("prior/cur/consec %02x %02x %d\n",

	3857 // prior_byte1x2x, byte1x2x, next_pair_consec_hi);

	3858 prior_byte1x2x = byte1x2x;

	3859

	3860 or_byte1 \|= byte1;

	3861 uint8 byte1f = byte1;

	3862 // Flip top bit of subscript to better separate quadrant 4 (esp. for Hebrew)

	3863 byte1f ^= (byte2 & 0x80);

	3864

	3865 // If the same bigram occurred recently, don't increment again

	3866 bool pair_used = false;

	3867 if (!RepeatedBigram(destatep, byte1, byte2)) {

	3868 ++counted_otherpairs;

	3869 pair_used = true;

	3870 // Boost both charset= declared encodings, so

	3871 // Nearly-same probability nearby encoding doesn't drift to the top

	3872 if (!FLAGS_demo_nodefault) {

	3873 destatep->enc_prob[destatep->declared_enc_1] += kDeclaredEncBoost >> wei ghtshift;

	3874 destatep->enc_prob[destatep->declared_enc_2] += kDeclaredEncBoost >> wei ghtshift;

	3875 }

	3876 bool was_bad_pair = false;

	3877 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {

	3878 int incr_shift = 0;

	3879 int rankedencoding = destatep->rankedencoding_list[j];

	3880 Encoding enc = kMapToEncoding[rankedencoding];

	3881

	3882 // For binary, Skip over repeated marker bytes, such as 02, FF, etc.

	3883 if ((rankedencoding == F_BINARY) &&

	3884 RepeatedBinary(destatep, byte1, byte2)) {

	3885 incr_shift = 2; // count 1/4 as much if repeated

	3886 }

	3887

	3888 // If byte 1x2x for this encoding is exactly zero, illegal byte pair

	3889 // Don't increment, but instead penalize

	3890 const UnigramEntry* ue = &unigram_table[rankedencoding];

	3891 if (ue->b12[byte1x2x] == 0) {

	3892 // Don't whack consecutive duplicate bad pairs -- overkill

	3893 if ((byte1 != prior_bad_byte1) \|\| (byte2 != prior_bad_byte2)) {

	3894 // Extra whack for illegal pair in this encoding

	3895 Whack(destatep, rankedencoding, kBadPairWhack >> weightshift);

	3896 was_bad_pair = true;

	3897 }

	3898 } else {

	3899 // OK to do the real increment

	3900 int incr = ue->b1[byte1f] + ue->b2[byte2] + ue->b12[byte1x2x];
	// Low bit of the b12 entry selects the hires table instead of ue->so

	3901 if ((ue->b12[byte1x2x] & 0x01) != 0) {

	3902 // Use a more-precise table

	3903 int byte32x32 = ((byte1 & 0x1f) << 5) \| (byte2 & 0x1f);

	3904 int hiressub = (byte2 & 0x60) >> 5; // select w/bits 5&6 of byte 2

	3905 DCHECK(ue->hires[hiressub] != NULL);

	3906 incr += ue->hires[hiressub][byte32x32];

	3907 } else {

	3908 // Default final offset

	3909 incr += ue->so;

	3910 }
	// Down-weight repeated binary markers (incr_shift), then low-weight text

	3911 incr >>= incr_shift;

	3912

	3913 incr >>= weightshift;

	3914 destatep->enc_prob[rankedencoding] += incr; // The actual increment

	3915

	3916 if (FLAGS_enc_detect_detail2) {

	3917 if (watch1_rankedenc == rankedencoding) {watch1_incr = incr;}

	3918 if (watch2_rankedenc == rankedencoding) {watch2_incr = incr;}

	3919 }

	3920 }

	3921

	3922

	3923 // If consecutive pair of high bytes, give slight boost to one-byte

	3924 // encodings that have a full alphabet in the high bytes

	3925 if (next_pair_consec_hi && HighAlphaEncoding(enc)) {

	3926 Boost(destatep, rankedencoding, kDeclaredEncBoost >> weightshift);

	3927 }

	3928 } // End for j < rankedencoding_list_len

	3929

	3930 if (was_bad_pair) {

	3931 prior_bad_byte1 = byte1;

	3932 prior_bad_byte2 = byte2;

	3933 }

	3934

	3935 // Fold in per-bigram most likely encoding for first N bigrams

	3936 if (next_pair < kBestPairsCount) {

	3937 int best_enc = kMostLikelyEncoding[(byte1 << 8) + byte2];

	3938 Boost(destatep, best_enc, kBestEncBoost >> weightshift);

	3939 }

	3940

	3941 // Possibly score 32 trigrams around a bigram to better separate

	3942 // Latin1 from Latin2 and Latin7. Especially helpful for detecting

	3943 // mis-labelled Hungarian latin2.

	3944 // If looking and at bigram 0,8,16,... do full scoring, else just 1 tri

	3945 if (destatep->do_latin_trigrams \|\|

	3946 destatep->looking_for_latin_trigrams) {

	3947 // If just looking, do full scan every 8 times

	3948 // Just look up one trigram the other 7 and do full scan if Latin2,7

	3949 bool scan32 = false;
	// NOTE(review): when offset_byte12 is 0 this forms a pointer one byte
	// before initial_src -- confirm that case cannot reach TrigramValue.

	3950 const uint8* trisrc = &destatep->initial_src[offset_byte12 - 1];

	3951 if (!destatep->do_latin_trigrams) {

	3952 if ((i & 7) == 0 \|\| trisrc + 3 > destatep->limit_src) {

	3953 scan32 = true;

	3954 } else {

	3955 scan32 = (kTriLatin1Likely < TrigramValue(trisrc));

	3956 }

	3957 }

	3958 if (destatep->do_latin_trigrams \|\| scan32) {

	3959 // Just score each block of 32 bytes once

	3960 int tri_block_offset = offset_byte12 & ~0x1f;

	3961 if (destatep->trigram_highwater_mark <= tri_block_offset) {

	3962 bool turnon = BoostLatin127Trigrams(tri_block_offset, destatep);

	3963 if (FLAGS_counts && !destatep->do_latin_trigrams && turnon) {

	3964 ++doing_used; // First time

	3965 }

	3966 if (FLAGS_enc_detect_source) {

	3967 if (!destatep->do_latin_trigrams && turnon) {

	3968 // First time

	3969 PsHighlight(trisrc, destatep->initial_src, 0, 2);

	3970 }

	3971 }
	// Once switched on, trigram scoring stays on for the rest of the scan

	3972 destatep->do_latin_trigrams \|= turnon;

	3973 destatep->trigram_highwater_mark = tri_block_offset + 32;

	3974 }

	3975 }

	3976 }

	3977

	3978 } // end if !RepeatedBigram()

	3979

	3980 // Keep track of initial byte high 3 bits

	3981 ++destatep->byte32_count[byte1 >> 5];

	3982

	3983

	3984 // TODO: boost subset/superset also

	3985 // Boost(destatep, kRelatedEncoding[best_enc], kBestEncBoost);

	3986

	3987 if (destatep->debug_data != NULL) {

	3988 // Show detail entry for this bigram

	3989 char buff[16];

	3990 snprintf(buff, sizeof(buff), "%c%02x%02x%c%c",

	3991 pair_used ? ' ' : '[',

	3992 byte1,

	3993 byte2,

	3994 pair_used ? ' ' : ']',

	3995 (weightshift == 0) ? ' ' : '-');

	3996

	3997 SetDetailsEncProb(destatep,

	3998 destatep->interesting_offsets[OtherPair][next_pair],

	3999 kMostLikelyEncoding[(byte1 << 8) + byte2],

	4000 buff);

	4001 }

	4002 if (FLAGS_enc_detect_detail2) {

	4003 if ((watch1_incr != 0) \|\| (watch2_incr != 0)) {

	4004 // Show increment detail for this encoding

	4005 char buff[32];

	4006 snprintf(buff, sizeof(buff), "%c%d %c%d",

	4007 (watch1_incr < 0) ? '-' : '+', watch1_incr,

	4008 (watch2_incr < 0) ? '-' : '+', watch2_incr);

	4009 SetDetailsEncLabel(destatep, buff);

	4010 }

	4011 }

	4012 } // End for i

	4013

	4014

	4015 // If no high bit on, demote all the two-byte codes

	4016 // WAS BUG. This was inside the loop above and should be outside

	4017 if ((counted_otherpairs > 0) && ((or_byte1 & 0x80) == 0)) {

	4018 // No high bit in this group (just 02xx, etc.). Whack 2-byte codes

	4019 // This keeps SJS from creeping past Latin1 on illegal C0 bytes

	4020 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {

	4021 int rankedencoding = destatep->rankedencoding_list[j];

	4022 Encoding enc = kMapToEncoding[rankedencoding];

	4023 if (TwoByteEncoding(enc)) {

	4024 Whack(destatep, rankedencoding, kGentlePairWhack * counted_otherpairs);

	4025 }

	4026 }

	4027 }

	4028

	4029

	4030 // BOOST

	4031 // ====================

	4032 if (AnyActive(destatep)) {

	4033 ActiveSpecialBoostWhack(src, destatep);

	4034 }

	4035

	4036 // Update for next time

	4037 destatep->prior_src = src;

	4038 destatep->prior_interesting_pair[AsciiPair] =

	4039 destatep->next_interesting_pair[AsciiPair];

	4040 destatep->prior_interesting_pair[OtherPair] =

	4041 destatep->next_interesting_pair[OtherPair];

	4042

	4043

	4044 // Do any pre-prune final adjustments

	4045 // ====================

	4046 if (prunereason == PRUNE_FINAL) {

	4047 // If UTF8 not in base state, whack

	4048 if (destatep->next_utf8_ministate != 0) {

	4049 Whack(destatep, F_UTF8, kGentlePairWhack * 2 * 1);

	4050 }

	4051 // If UTF8UTF8 not in base state, whack

	4052 if (destatep->next_utf8utf8_ministate != 0) {

	4053 Whack(destatep, F_UTF8UTF8, kGentlePairWhack * 2 * 1);

	4054 }

	4055

	4056 // If no valid UTF-8 char ever seen, whack

	4057 if (destatep->utf8_minicount[5] == 0) {

	4058 Whack(destatep, F_UTF8, kBadPairWhack * 8); // No sequence

	4059 Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8); // No sequence

	4060 }

	4061

	4062 // If no valid UTF8UTF8 char ever seen, whack

	4063 if (destatep->utf8utf8_minicount[5] == 0) {

	4064 Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8); // No sequence

	4065 }

	4066

	4067 // If not all four binary quadrants, whack BINARY;

	4068 // worth 2 pair if 3 quads, 4 pair if 1 or 2 quads

	4069 if (destatep->binary_quadrants_count < 4) {

	4070 if (destatep->binary_quadrants_count == 3) {

	4071 Whack(destatep, F_BINARY, kBadPairWhack * 2);

	4072 } else {

	4073 Whack(destatep, F_BINARY, kBadPairWhack * 4);

	4074 }

	4075 }

	4076

	4077 // If 1st pair is 1b24, choose between ISO-2022-xx

	4078 // <esc> $ ) C ISO-2022-KR [1b 24 29 43]

	4079 // <esc> $ ) A ISO-2022-CN [1b 24 29 41]

	4080 // <esc> $ ) G ISO-2022-CN [1b 24 29 47]

	4081 // <esc> $ * H ISO-2022-CN [1b 24 2a 48]

	4082 // <esc> ( B ISO-2022-JP [1b 28 42] to ASCII

	4083 // <esc> ( J ISO-2022-JP [1b 28 4a] to X0201

	4084 // <esc> $ @ ISO-2022-JP [1b 24 40] to X0208-78 twobyte

	4085 // <esc> $ B ISO-2022-JP [1b 24 42] to X0208-83 twobyte
	// NOTE(review): only sequences whose first pair is 1b 24 are examined
	// here, and the branches below match ")C", ")A"/")G", "@" and "B"; the
	// "* H" and "<esc> (" forms listed above have no matching branch.

	4086 if ((destatep->next_interesting_pair[OtherPair] >= 1) &&

	4087 Iso2022Active(destatep)) {

	4088 if ((destatep->interesting_pairs[OtherPair][0] == 0x1b) &&

	4089 (destatep->interesting_pairs[OtherPair][1] == 0x24)) {

	4090 int offset = destatep->interesting_offsets[OtherPair][0];

	4091 const uint8* esc_src = destatep->initial_src + offset;
	// Need esc_src[0..3] inside the buffer before inspecting bytes 2 and 3

	4092 if ((destatep->initial_src + offset) < (destatep->limit_src - 3)) {

	4093 if ((esc_src[2] == ')') && (esc_src[3] == 'C')) {

	4094 Boost(destatep, F_ISO_2022_KR, kBoostOnePair);

	4095 Whack(destatep, F_ISO_2022_CN, kBadPairWhack);

	4096 Whack(destatep, F_JIS, kBadPairWhack);

	4097 } else if ((esc_src[2] == ')') && ((esc_src[3] == 'A') \|\|

	4098 (esc_src[3] == 'G'))) {

	4099 Boost(destatep, F_ISO_2022_CN, kBoostOnePair);

	4100 Whack(destatep, F_ISO_2022_KR, kBadPairWhack);

	4101 Whack(destatep, F_JIS, kBadPairWhack);

	4102 } else if ((esc_src[2] == '@') \|\| (esc_src[2] == 'B')) {

	4103 Boost(destatep, F_JIS, kBoostOnePair);

	4104 Whack(destatep, F_ISO_2022_CN, kBadPairWhack);

	4105 Whack(destatep, F_ISO_2022_KR, kBadPairWhack);

	4106 }

	4107 } else {

	4108 // Incomplete escape sequence. Whack them all

	4109 Whack(destatep, F_JIS, kBadPairWhack);

	4110 Whack(destatep, F_ISO_2022_CN, kBadPairWhack);

	4111 Whack(destatep, F_ISO_2022_KR, kBadPairWhack);

	4112 }

	4113 }

	4114 }

	4115 if (destatep->debug_data != NULL) {

	4116 SetDetailsEncLabel(destatep, "pre-final");

	4117 }

	4118 }

	4119

	4120 // PRUNE

	4121 // ====================

	4122 // Find current top two rankedencoding probabilities

	4123 ReRank(destatep);

	4124

	4125 if (prunereason == PRUNE_SLOWEND) {

	4126 if (destatep->debug_data != NULL) {

	4127 SetDetailsEncLabel(destatep, "slow-end");

	4128 }

	4129 }

	4130

	4131 // Keep every rankedencoding with probability >= top_prob - prune_difference

	4132 int prune_diff = destatep->prune_difference;

	4133 // If the top encoding is BINARY, it might be overstated, and we might

	4134 // therefore prune away the real encoding. Make the pruning delta

	4135 // twice as big.

	4136 if (destatep->top_rankedencoding == F_BINARY) {

	4137 prune_diff *= 2;

	4138 }

	4139 int keep_prob = destatep->top_prob - prune_diff;

	4140

	4141 // Tighten pruning difference (we start wide) for next time

	4142 if (destatep->prune_difference > kFinalPruneDifference) {

	4143 int decrement = kPruneDiffDecrement;

	4144 // If only ASCII pairs, small tighten; if some non-ASCII, full tighten

	4145 if (counted_otherpairs == 0) {

	4146 decrement >>= 1;

	4147 }

	4148 destatep->prune_difference -= decrement;

	4149 }

	4150

	4151 // Prune the list of active encoding families
	// Survivors are compacted in place: j reads, k writes. active_special is
	// rebuilt from the survivors only.

	4152 destatep->active_special = 0;

	4153 int k = 0;

	4154 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {

	4155 bool keep = true;

	4156 int rankedencoding = destatep->rankedencoding_list[j];

	4157

	4158 // If count is too low, ditch it

	4159 if (destatep->enc_prob[rankedencoding] < keep_prob) {

	4160 keep = false;

	4161 }

	4162

	4163 // If at end of slow section, ditch any 7-bit with zero evidence so far

	4164 if ((prunereason == PRUNE_SLOWEND) &&

	4165 SevenBitEncoding(kMapToEncoding[rankedencoding]) &&

	4166 (destatep->enc_prob[rankedencoding] <= 0) &&

	4167 (rankedencoding != destatep->top_rankedencoding)) {

	4168 keep = false;

	4169 }

	4170

	4171 // Keep it. This will always keep at least top_prob rankedencoding

	4172 if (keep) {

	4173 destatep->active_special \|= kSpecialMask[kMapToEncoding[rankedencoding]];

	4174 destatep->rankedencoding_list[k++] = rankedencoding;

	4175 }

	4176 }

	4177

	4178 if (destatep->debug_data != NULL) {

	4179 char buff[32];

	4180 snprintf(buff, sizeof(buff), "%d prune", prune_diff / XLOG2);

	4181 SetDetailsEncLabel(destatep, buff);

	4182 }

	4183 destatep->rankedencoding_list_len = k;

	4184

	4185

	4186

	4187 // Force final result in some cases

	4188 // Do any post-prune final adjustments

	4189 if (prunereason == PRUNE_FINAL) {

	4190 // If no high-byte pairs, result is ASCII7, BINARY, UTF7, 2022, or HZ

	4191 if (destatep->next_interesting_pair[OtherPair] == 0) {

	4192 if ((destatep->top_rankedencoding != F_BINARY) &&

	4193 (destatep->top_rankedencoding != F_UTF7) &&

	4194 (destatep->top_rankedencoding != F_ISO_2022_CN) &&

	4195 (destatep->top_rankedencoding != F_ISO_2022_KR) &&

	4196 (destatep->top_rankedencoding != F_JIS) &&

	4197 (destatep->top_rankedencoding != F_HZ_GB_2312)) {

	4198 destatep->top_rankedencoding = F_ASCII_7_bit;

	4199 Boost(destatep, F_ASCII_7_bit, kBoostOnePair * 2);

	4200 }

	4201 }

	4202

	4203 // If some 89 pairs, not ISO_8859_x and vice versa
	// byte32_count[4] counts pairs whose first byte is 0x80..0x9F (byte1 >> 5 == 4)

	4204 if (destatep->byte32_count[4] > 0) {

	4205 switch (destatep->top_rankedencoding) {

	4206 case F_ASCII: // ISO-8859-1

	4207 destatep->top_rankedencoding = F_CP1252;

	4208 // Better: destatep->enc_prob[F_ASCII] <==> destatep->enc_prob[F_CP1252]

	4209 Boost(destatep, F_CP1252, kBoostOnePair * 2);

	4210 break;

	4211 case F_Latin2: // ISO-8859-2

	4212 // Don't swap back; not superset

	4213 //destatep->top_rankedencoding = F_CP1250;

	4214 //Boost(destatep, F_CP1250, kBoostOnePair * 2);

	4215 break;

	4216 case F_Arabic: // ISO-8859-6

	4217 destatep->top_rankedencoding = F_CP1256;

	4218 Boost(destatep, F_CP1256, kBoostOnePair * 2);

	4219 break;

	4220 case F_Greek: // ISO-8859-7

	4221 // Don't swap -- not proper superset

	4222 // Capital Alpha tonos at 0xB6 in ISO-8859-7, 0xA2 in CP1253

	4223 //destatep->top_rankedencoding = F_CP1253;

	4224 //Boost(destatep, F_CP1253, kBoostOnePair * 2);

	4225 break;

	4226 case F_Hebrew: // ISO-8859-8

	4227 // Don't swap -- visual vs. logical

	4228 //destatep->top_rankedencoding = F_CP1255;

	4229 //Boost(destatep, F_CP1255, kBoostOnePair * 2);

	4230 break;

	4231 case F_Latin5: // ISO-8859-9

	4232 destatep->top_rankedencoding = F_CP1254;

	4233 Boost(destatep, F_CP1254, kBoostOnePair * 2);

	4234 break;

	4235 case F_ISO_8859_11: // ISO-8859-11

	4236 destatep->top_rankedencoding = F_CP874;

	4237 Boost(destatep, F_CP874, kBoostOnePair * 2);

	4238 break;

	4239 }

	4240 } else {

	4241 switch (destatep->top_rankedencoding) {

	4242 case F_CP1252: // ISO-8859-1

	4243 destatep->top_rankedencoding = F_ASCII;

	4244 Boost(destatep, F_ASCII, kBoostOnePair * 2);

	4245 break;

	4246 case F_CP1250: // ISO-8859-2

	4247 // Don't swap back; not superset

	4248 //destatep->top_rankedencoding = F_Latin2;

	4249 //Boost(destatep, F_Latin2, kBoostOnePair * 2);

	4250 break;

	4251 case F_CP1256: // ISO-8859-6

	4252 // Don't swap back -- not proper superset

	4253 //destatep->top_rankedencoding = F_Arabic;

	4254 //Boost(destatep, F_Arabic, kBoostOnePair * 2);

	4255 break;

	4256 case F_CP1253: // ISO-8859-7

	4257 // Don't swap back -- not proper superset

	4258 //destatep->top_rankedencoding = F_Greek;

	4259 //Boost(destatep, F_Greek, kBoostOnePair * 2);

	4260 break;

	4261 case F_CP1255: // ISO-8859-8

	4262 // Don't swap back -- not proper superset

	4263 //destatep->top_rankedencoding = F_Hebrew;

	4264 //Boost(destatep, F_Hebrew, kBoostOnePair * 2);

	4265 break;

	4266 case F_CP1254: // ISO-8859-9

	4267 destatep->top_rankedencoding = F_Latin5;

	4268 Boost(destatep, F_Latin5, kBoostOnePair * 2);

	4269 break;

	4270 case F_CP874: // ISO-8859-11

	4271 destatep->top_rankedencoding = F_ISO_8859_11;

	4272 Boost(destatep, F_ISO_8859_11, kBoostOnePair * 2);

	4273 break;

	4274 }

	4275 }

	4276

	4277 if (destatep->debug_data != NULL) {

	4278 char buff[32];

	4279 snprintf(buff, sizeof(buff), "final %d",

	4280 static_cast<int>(src - destatep->initial_src));

	4281 SetDetailsEncLabel(destatep, buff);

	4282

	4283 // Show winning encoding and its delta log base2 from 2nd-best

	4284 // Divide delta by XLOG2 to get log base 2

	4285 int delta = destatep->top_prob - destatep->second_top_prob;

	4286 if (delta < (2 * XLOG2)) {

	4287 delta /= XDECILOG2;

	4288 snprintf(buff, sizeof(buff), "+%d.%d %s ",

	4289 delta / 10, delta % 10,

	4290 MyEncodingName(kMapToEncoding[destatep->top_rankedencoding]));

	4291 } else if (delta < (50 * XLOG2)) {

	4292 delta /= XLOG2;

	4293 snprintf(buff, sizeof(buff), "+%d %s",

	4294 delta,

	4295 MyEncodingName(kMapToEncoding[destatep->top_rankedencoding]));

	4296 } else {

	4297 snprintf(buff, sizeof(buff), "%s",

	4298 MyEncodingName(kMapToEncoding[destatep->top_rankedencoding]));

	4299 }

	4300 SetDetailsEncProbCopyOffset(destatep, destatep->top_rankedencoding, buff);

	4301 }

	4302 }

	4303

	4304

	4305 // FINISH

	4306 // ====================

	4307 // Eventual encoding result is reliable if big difference in top two, or if

	4308 // only Ascii7 ever encountered

	4309 // Also reliable if exactly one OtherPair and its best encoding matches top

	4310 destatep->reliable = false;

	4311 if (destatep->next_interesting_pair[OtherPair] == 0) {

	4312 // Only 7-bit ASCII

	4313 destatep->reliable = true;

	4314 }

	4315 if ((destatep->top_prob - destatep->second_top_prob) >=

	4316 FLAGS_ced_reliable_difference) {

	4317 destatep->reliable = true;

	4318 }

	4319 if (destatep->next_interesting_pair[OtherPair] == 1) {

	4320 uint8 byte1 = destatep->interesting_pairs[OtherPair][0];

	4321 uint8 byte2 = destatep->interesting_pairs[OtherPair][1];

	4322 int best_enc = kMostLikelyEncoding[(byte1 << 8) + byte2];

	4323 if (best_enc == destatep->top_rankedencoding) {

	4324 destatep->reliable = true;

	4325 }

	4326 }

	4327

	4328 // If we pruned to one encoding, we are done

	4329 if (destatep->rankedencoding_list_len == 1) {

	4330 destatep->reliable = true;

	4331 destatep->done = true;

	4332 }

	4333

	4334 // If we pruned to two or three encodings in the same *superset/subset

	4335 // rankedencoding* and enough pairs, we are done. Else keep going
	// "Enough" here is tested as prune_count >= 3

	4336 if (destatep->rankedencoding_list_len == 2) {

	4337 Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];

	4338 Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];

	4339 if (kMapEncToBaseEncoding[enc0] == kMapEncToBaseEncoding[enc1]) {

	4340 if (destatep->prune_count >= 3) {

	4341 destatep->reliable = true;

	4342 destatep->done = true;

	4343 }

	4344 }

	4345 } else if (destatep->rankedencoding_list_len == 3) {

	4346 Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];

	4347 Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];

	4348 Encoding enc2 = kMapToEncoding[destatep->rankedencoding_list[2]];

	4349 Encoding base0 = kMapEncToBaseEncoding[enc0];

	4350 Encoding base1 = kMapEncToBaseEncoding[enc1];

	4351 Encoding base2 = kMapEncToBaseEncoding[enc2];

	4352

	4353 if ((base0 == base1) && (base0 == base2)) {

	4354 if (destatep->prune_count >= 3) {

	4355 destatep->reliable = true;

	4356 destatep->done = true;

	4357 }

	4358 }

	4359 }

	4360 }

	4361

	4362

	4363 // Accumulate aligned byte-pair at src

	4364 // Occasionally, calc boost for some encodings and then prune the active list

	4365 // weightshift is used to give low weight some text, such as inside tags

	4366 // Returns true if pruning occurred

	4367 bool IncrementAndBoostPrune(const uint8* src,

	4368 int remaining_length,

	4369 DetectEncodingState* destatep,

	4370 int weightshift,

	4371 int exit_reason) {

	4372 destatep->last_pair = src;

	4373 // Pick up byte pair, or very last byte plus 0x20

	4374 uint8 byte1 = src[0];

	4375 uint8 byte2 = 0x20;

	4376 if (1 < remaining_length) {byte2 = src[1];}

	4377

	4378 // whatset=0 for Ascii + ~, 1 for all others; see kTestPrintableAsciiTildePlus

	4379 int whatset = exit_reason - 1;

	4380 int next_pair = destatep->next_interesting_pair[whatset];

	4381

	4382 if (next_pair > 16) {

	4383 // If not clear by 16 bigrams, stop accumulating + ~ 00

	4384 if (byte1 == '+') {return false;}

	4385 if (byte1 == '~') {return false;}

	4386 if (byte1 == 0x00) {return false;}

	4387 }

	4388

	4389 // Remember pair in appropriate list

	4390 if (next_pair >= kMaxPairs) {

	4391 // We have filled up our alloted space for interesting pairs with no

	4392 // decision. If ASCII pairs full, just skip until end of slow loop; if

	4393 // non-Ascii pairs full, force done

	4394 if (whatset == OtherPair) {

	4395 destatep->done = true;

	4396 }

	4397 } else {

	4398 int offset = static_cast<int>(src - destatep->initial_src);

	4399 destatep->interesting_pairs[whatset][next_pair * 2 + 0] = byte1;

	4400 destatep->interesting_pairs[whatset][next_pair * 2 + 1] = byte2;

	4401 destatep->interesting_offsets[whatset][next_pair] = offset;

	4402 destatep->interesting_weightshift[whatset][next_pair] = weightshift;

	4403 ++destatep->next_interesting_pair[whatset];

	4404 ++next_pair;

	4405 }

	4406

	4407 // Prune now and then , but always if forced to be done

	4408 if (destatep->done \|\| ((next_pair & kPruneMask) == 0)) { // Prune every M

	4409 BoostPrune(src + 2, destatep, PRUNE_NORMAL); // src+2 first unscanned byte

	4410 // may be off end of input

	4411 return true;

	4412 }

	4413 return false;

	4414 }

	4415

	4416 void DumpSummary(DetectEncodingState* destatep, int whatset, int n) {

	4417 printf(" %sSummary[%2d]: ", kWhatSetName[whatset],

	4418 destatep->next_interesting_pair[whatset]);

	4419 int limit = minint(n, destatep->next_interesting_pair[whatset]);

	4420 for (int i = 0; i < limit; ++i) {

	4421 printf("%02x%02x ",

	4422 destatep->interesting_pairs[whatset][i * 2 + 0],

	4423 destatep->interesting_pairs[whatset][i * 2 + 1]);

	4424 if ((i & 7) == 7) {printf(" ");}

	4425 }

	4426 printf("\n");

	4427 }

	4428

	4429 void BeginDetail(DetectEncodingState* destatep) {

	4430 fprintf(stderr, "%d [", NUM_RANKEDENCODING);

	4431 for (int e = 0; e < NUM_RANKEDENCODING; ++e) {

	4432 fprintf(stderr, "(%s)", MyRankedEncName(e));

	4433 if ((e % 10) == 9) {fprintf(stderr, "\n ");}

	4434 }

	4435 fprintf(stderr, "] size-detail\n");

	4436 destatep->next_detail_entry = 0;

	4437 }

	4438

	// Maps the gap between two bigrams to one printable ASCII character:
	//   0 -> ' ',  up to 2 (including negative values) -> '=',
	//   3..15 -> '_',  16..31 -> '+',  anything larger -> ' '
	char DetailOffsetChar(int delta) {
	  if (delta == 0) {
	    return ' ';
	  }
	  // Start from the widest bucket and narrow down; the last match wins
	  char marker = ' ';
	  if (delta <= 31) {marker = '+';}
	  if (delta <= 15) {marker = '_';}
	  if (delta <= 2) {marker = '=';}
	  return marker;
	}

	4447

	4448 void DumpDetail(DetectEncodingState* destatep) {

	4449 // Turn all counts into delta from previous entry

	4450 fprintf(stderr, "%d count-detail\n", destatep->next_detail_entry);

	4451 // Rewrite, recording deltas

	4452 for (int z = destatep->next_detail_entry - 1; z > 0; --z) {

	4453 destatep->debug_data[z].offset -= destatep->debug_data[z - 1].offset;

	4454 for (int e = 0; e < NUM_RANKEDENCODING; ++e) {

	4455 destatep->debug_data[z].detail_enc_prob[e] -=

	4456 destatep->debug_data[z - 1].detail_enc_prob[e];

	4457 }

	4458 }

	4459 // Now print

	4460 for (int z = 0; z < destatep->next_detail_entry; ++z) {

	4461 // Highlight some entries ending in '!' with light red underbar

	4462 int len = destatep->debug_data[z].label.size();

	4463 if (destatep->debug_data[z].label[len - 1] == '!') {

	4464 fprintf(stderr, "1 0.9 0.9 do-flag\n");

	4465 }

	4466 fprintf(stderr, "(%c%s) %d [",

	4467 DetailOffsetChar(destatep->debug_data[z].offset),

	4468 destatep->debug_data[z].label.c_str(),

	4469 destatep->debug_data[z].best_enc);

	4470 for (int e = 0; e < NUM_RANKEDENCODING; ++e) {

	4471 fprintf(stderr, "%d ", destatep->debug_data[z].detail_enc_prob[e]);

	4472 if ((e % 10) == 9) {fprintf(stderr, " ");}

	4473 }

	4474 fprintf(stderr, "] do-detail-e\n");

	4475 }

	4476 // Get ready for next time,if any

	4477 destatep->next_detail_entry = 0;

	4478 }

	4479

	// Writes one line to stderr that ends the current detail section and starts a
	// new one labelled with buff, followed by a blank line.
	void PsRecurse(const char* buff) {
	  fprintf(stderr,
	          "() end-detail (%s) start-detail\n\n",
	          buff);
	}

	4483

	4484 void DumpReliable(DetectEncodingState* destatep) {

	4485 printf("Not reliable: ");

	4486

	4487 // Find center of gravity of OtherPair list

	4488 int x_sum = 0;

	4489 int y_sum = 0;

	4490 int count = destatep->next_interesting_pair[OtherPair];

	4491 for (int i = 0; i < count; ++i) {

	4492 uint8 byte1 = destatep->interesting_pairs[OtherPair][i * 2 + 0];

	4493 uint8 byte2 = destatep->interesting_pairs[OtherPair][i * 2 + 1];

	4494 x_sum += byte2;

	4495 y_sum += byte1;

	4496 }

	4497 if (count == 0) {count = 1;} // adoid zdiv

	4498 int x_bar = x_sum / count;

	4499 int y_bar = y_sum / count;

	4500 printf("center %02X,%02X\n", x_bar, y_bar);

	4501

	4502 double closest_dist = 999.0;

	4503 int closest = 0;

	4504 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {

	4505 int rankedencoding = destatep->rankedencoding_list[j];

	4506 const UnigramEntry* ue = &unigram_table[rankedencoding];

	4507 printf(" %8s = %4d at %02x,%02x +/- %02X,%02X ",

	4508 MyEncodingName(kMapToEncoding[rankedencoding]),

	4509 destatep->enc_prob[rankedencoding],

	4510 ue->x_bar, ue->y_bar,

	4511 ue->x_stddev, ue->y_stddev);

	4512 double x_diff = x_bar - ue->x_bar;

	4513 double y_diff = y_bar - ue->y_bar;

	4514 double dist = sqrt((x_diff * x_diff) + (y_diff * y_diff));

	4515 printf("(%3.1f)\n", dist);

	4516

	4517 if (closest_dist > dist) {

	4518 closest_dist = dist;

	4519 closest = rankedencoding;

	4520 }

	4521 }

	4522 printf("Closest=%s (%3.1f)\n",

	4523 MyEncodingName(kMapToEncoding[closest]), closest_dist);

	4524

	4525 for (int i = 0; i < 8; ++i) {

	4526 // Demote by distance to CG and see if that helps, or just quit

	4527 }

	4528 }

	4529

	4530 // Scan short single lines quickly for all printable ASCII

	4531 // Return true if all bytes are in [20..7F], false otherwise

	4532 bool QuickPrintableAsciiScan(const char* text, int text_length) {

	4533 const uint8* src = reinterpret_cast<const uint8*>(text);

	4534 const uint8* srclimit = src + text_length;

	4535 const uint8* srclimit8 = srclimit - 7;

	4536 while (src < srclimit8) {

	4537 const uint32* s = reinterpret_cast<const uint32*>(src);

	4538 uint32 tmp1 = s[0];

	4539 uint32 tmp2 = s[1];

	4540 src += 8;

	4541 // Exits on any byte outside [0x20..0x7E] range (HT LF CR exit)

	4542 uint32 byte_outside_range_mask = ((tmp1 - 0x20202020U) \|

	4543 (tmp1 + 0x01010101U) \|

	4544 (tmp2 - 0x20202020U) \|

	4545 (tmp2 + 0x01010101U));

	4546 if ((byte_outside_range_mask & 0x80808080U) != 0) {

	4547 src -= 8;

	4548 break;

	4549 }

	4550 }

	4551 while (src < srclimit) {

	4552 uint8 uc = *src++;

	4553 if (kIsPrintableAscii[uc] == 0) {return false;}

	4554 }

	4555 return true;

	4556 }

	4557

	4558 static const int kMaxScanBack = 192;

	4559 static const int kMaxScanForward = 64;

	4560

	4561 // Return true if text is inside a tag or JS comment

	4562 bool TextInsideTag(const uint8* isrc, const uint8* src, const uint8* srclimit) {

	4563 const uint8* srcbacklimit = src - kMaxScanBack;

	4564 if (srcbacklimit < isrc) {

	4565 srcbacklimit = isrc;

	4566 }

	4567 const uint8* ss = src - 1;

	4568 while (srcbacklimit <= ss) {

	4569 uint8 c = *ss--;

	4570 if ((c & ~0x02) == '<') {

	4571 // We found preceding < 3C or > 3E nearby

	4572 // Even cheaper: if inside a tag, we don't care what tag; return true

	4573 if (c == '<') {

	4574 return true;

	4575 }

	4576 // See if we are just after <title>...

	4577 if ((c == '>') && (isrc <= (ss - 5)) &&

	4578 (ss[-5] == '<') &&

	4579 ((ss[-4] \| 0x20) == 't') &&

	4580 ((ss[-3] \| 0x20) == 'i') &&

	4581 ((ss[-2] \| 0x20) == 't') &&

	4582 ((ss[-1] \| 0x20) == 'l') &&

	4583 ((ss[-0] \| 0x20) == 'e')) {

	4584 return true;

	4585 }

	4586 // See if we are just after <SCRIPT language=javascript>...

	4587 if ((c == '>') && (isrc <= (ss - 5)) &&

	4588 (ss[-5] == 's') &&

	4589 ((ss[-4] \| 0x20) == 'c') &&

	4590 ((ss[-3] \| 0x20) == 'r') &&

	4591 ((ss[-2] \| 0x20) == 'i') &&

	4592 ((ss[-1] \| 0x20) == 'p') &&

	4593 ((ss[-0] \| 0x20) == 't')) {

	4594 return true;

	4595 }

	4596 // Not in a tag

	4597 return false;

	4598 // See if we are just after JavaScript comment /* ...

	4599 } else if (c == '/') {

	4600 if (((ss + 2) < srclimit) && (ss[2] == '*')) {

	4601 // We backscanned to /*

	4602 return true;

	4603 }

	4604 }

	4605 }

	4606

	4607 return false;

	4608 }

	4609

	4610 const uint8* SkipToTagEnd(const uint8* isrc, const uint8* src, const uint8* srcl imit) {

	4611 const uint8* ss = src + 1;

	4612 while (ss <= srclimit) {

	4613 uint8 c = *ss++;

	4614 if ((c == '<') \|\| (c == '>')) {

	4615 return ss;

	4616 }

	4617 }

	4618 return src + 2; // Always make progress, Otherwise we get an infinite loop

	4619 }

	4620

	4621

	4622 // Take a watch string and map to a ranked encoding. If no match, return -1

	4623 int LookupWatchEnc(const string& watch_str) {

	4624 int watchval = -1;

	4625 // Mixed encoding maps to enc=UTF8UTF8

	4626 if (watch_str == "UTF8UTF8") {

	4627 watchval = F_UTF8UTF8;

	4628 } else {

	4629 Encoding enc;

	4630 if (EncodingFromName(watch_str.c_str(), &enc)) {

	4631 watchval = CompactEncDet::BackmapEncodingToRankedEncoding(enc);

	4632 }

	4633 }

	4634 return watchval;

	4635 }

	4636

	4637 // Return true if enc and enc2 are equal or one is a subset of the other

	4638 // or either is UNKNOWN

	4639 // also UTF8UTF8 is compatible with both Latin1 and UTF8

	4640 bool CompatibleEnc(Encoding enc, Encoding enc2) {

	4641 if (enc < 0) {return false;}

	4642 if (NUM_ENCODINGS <= enc) {return false;}

	4643 if (enc2 < 0) {return false;}

	4644 if (NUM_ENCODINGS <= enc2) {return false;}

	4645 if (enc == enc2) {return true;}

	4646 if (kMapEncToBaseEncoding[enc] == kMapEncToBaseEncoding[enc2]) {return true;}

	4647

	4648 if (enc == ASCII_7BIT) {return true;}

	4649 if (enc2 == ASCII_7BIT) {return true;}

	4650 if (enc == UNKNOWN_ENCODING) {return true;}

	4651 if (enc2 == UNKNOWN_ENCODING) {return true;}

	4652 if (enc == UTF8UTF8) {

	4653 if (enc2 == UTF8) {return true;}

	4654 if (kMapEncToBaseEncoding[enc2] == ISO_8859_1) {return true;}

	4655 }

	4656 if (enc2 == UTF8UTF8) {

	4657 if (enc == UTF8) {return true;}

	4658 if (kMapEncToBaseEncoding[enc] == ISO_8859_1) {return true;}

	4659 }

	4660

	4661 return false;

	4662 }

	4663

	4664 // Return superset of enc and enc2, which must be compatible

	4665 Encoding SupersetEnc(Encoding enc, Encoding enc2) {

	4666 //printf(" SupersetEnc (%s, ", MyEncodingName(enc)); // TEMP

	4667 //printf("%s) ", MyEncodingName(enc2));

	4668 //printf("= %s\n",

	4669 // MyEncodingName(kMapEncToSuperLevel[enc] >= kMapEncToSuperLevel[enc2] ?

	4670 // enc :enc2));

	4671 if (kMapEncToSuperLevel[enc] >= kMapEncToSuperLevel[enc2]) {

	4672 return enc;

	4673 }

	4674 return enc2;

	4675 }

	4676

	4677

	4678 // If unreliable, try rescoring to separate some encodings

	4679 Encoding Rescore(Encoding enc, const uint8* isrc,

	4680 const uint8* srctextlimit, DetectEncodingState* destatep) {

	4681 if (FLAGS_counts) {++rescore_used;}

	4682 Encoding new_enc = enc;

	4683

	4684 bool rescore_change = false;

	4685

	4686 int count = destatep->next_interesting_pair[OtherPair];

	4687 int text_length = srctextlimit - isrc;

	4688 for (int i = 0; i < count; ++i) {

	4689 int bigram_offset = destatep->interesting_offsets[OtherPair][i];

	4690 uint8 byte0 = (0 < bigram_offset) ?

	4691 isrc[bigram_offset - 1] : 0x20;

	4692 uint8 byte1 = isrc[bigram_offset + 0]; // Known to have high bit on

	4693 uint8 byte2 = ((bigram_offset + 1) < text_length) ?

	4694 isrc[bigram_offset + 1] : 0x20;

	4695 uint8 byte3 = ((bigram_offset + 2) < text_length) ?

	4696 isrc[bigram_offset + 2] : 0x20;

	4697 int high_hash = ((byte0 & 0xc0) >> 0) \|

	4698 ((byte1 & 0xc0) >> 1) \|

	4699 ((byte2 & 0xc0) >> 4) \|

	4700 ((byte3 & 0xc0) >> 6); // 00112233

	4701

	4702 // Boost HighAccent encodings for Ascii bit patterns

	4703 // 0x1x 0x0x

	4704 // 1010 1010

	4705 // 0010 0000

	4706 //

	4707 if ((high_hash & 0xaa) == 0x20) {

	4708 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {

	4709 int rankedencoding = destatep->rankedencoding_list[j];

	4710 if (HighAccentEncoding(kMapToEncoding[rankedencoding])) {

	4711 // TODO: also want to boost Shift-JIS here if byte1 is Ax..Dx

	4712 // TEMP

	4713 //printf(" Rescore[%02x] %s +%d\n",

	4714 // high_hash, MyRankedEncName(rankedencoding), kGentlePairBoost) ;

	4715 Boost(destatep, rankedencoding, kGentlePairBoost);

	4716 rescore_change = true;

	4717 }

	4718 }

	4719 }

	4720

	4721 // Whack HighAccent encodings for high bit patterns

	4722 // 1x1x 1x1x

	4723 // 1010 1010

	4724 // 1010 1010

	4725 //

	4726 if ((high_hash & 0xaa) == 0xaa) {

	4727 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {

	4728 int rankedencoding = destatep->rankedencoding_list[j];

	4729 if (HighAccentEncoding(kMapToEncoding[rankedencoding])) {

	4730 // TEMP

	4731 //printf(" Rescore[%02x] %s -%d\n",

	4732 // high_hash, MyRankedEncName(rankedencoding), kGentlePairBoost) ;

	4733 Whack(destatep, rankedencoding, kGentlePairBoost);

	4734 rescore_change = true;

	4735 }

	4736 }

	4737 }

	4738

	4739 }

	4740

	4741 if (rescore_change) {

	4742 ReRank(destatep);

	4743 new_enc = kMapToEncoding[destatep->top_rankedencoding];

	4744

	4745 if (destatep->debug_data != NULL) {

	4746 char buff[32];

	4747 snprintf(buff, sizeof(buff), "=Rescore %s", MyEncodingName(new_enc));

	4748 SetDetailsEncProb(destatep,

	4749 0,

	4750 CompactEncDet::BackmapEncodingToRankedEncoding(new_enc),

	4751 buff);

	4752 //// DumpDetail(destatep);

	4753 }

	4754

	4755 SimplePrune(destatep, kFinalPruneDifference);

	4756 CalcReliable(destatep);

	4757 }

	4758

	4759 //if (new_enc != enc) {

	4760 // // TEMP

	4761 // printf(" Rescore new top encoding = %s\n",

	4762 // MyRankedEncName(destatep->top_rankedencoding));

	4763 //}

	4764

	4765 return new_enc;

	4766 }

	4767

	4768

	4769 // Given an encoding, add its corresponding ranked encoding to the set

	4770 void AddToSet(Encoding enc, int* list_len, int* list) {

	4771 // TEMP print

	4772 int item = CompactEncDet::BackmapEncodingToRankedEncoding(enc);

	4773 for (int i = 0; i < *list_len; ++i) {

	4774 if (list[i] == item) {

	4775 return; // Already in the set; don't add again

	4776 }

	4777 }

	4778 list[(*list_len)++] = item;

	4779 }

	4780

	4781

	4782 static const int kMinRobustBigramCount = 1000;

	4783 static const int kMinKBToRobustScan = 64;

	4784 static const int kMaxKBToRobustScan = 256;

	4785

	4786 // Scan the first 64K or so, just doing raw bigram increments on given

	4787 // probability list.

	4788 // No fancy duplicate filtering or anything else here.

	4789 // Returns number of bigrams counted

	4790 int RobustScan(const char* text,

	4791 int text_length,

	4792 int robust_renc_list_len,

	4793 int* robust_renc_list,

	4794 int* robust_renc_probs) {

	4795 if (FLAGS_counts) {++robust_used;}

	4796 // Zero all the result probabilities

	4797 for (int i = 0; i < robust_renc_list_len; ++i) {

	4798 robust_renc_probs[i] = 0;

	4799 }

	4800 int max_fast_len = minint(text_length, (kMaxKBToRobustScan << 10));

	4801 const uint8* isrc = reinterpret_cast<const uint8*>(text);

	4802 const uint8* src = isrc;

	4803 const uint8* srclimitfast2 = isrc + max_fast_len - 1;

	4804 const uint8* srclimitfast4 = isrc + max_fast_len - 3;

	4805

	4806 int min_fast_len = minint(text_length, (kMinKBToRobustScan << 10));

	4807 const uint8* srclimitmin = isrc + min_fast_len - 1;

	4808

	4809 int bigram_count = 0;

	4810

	4811 if (FLAGS_enc_detect_source) {

	4812 PsSourceInit(kPsSourceWidth);

	4813 fprintf(stderr, "(RobustScan) do-src\n");

	4814 }

	4815

	4816 // Sum over a big chunk of the input

	4817 // Faster loop, no 7-bit-encodings possible, approx 3000 GB/sec

	4818 //====================================

	4819 while (src < srclimitfast2) {

	4820 // Skip to next interesting bigram

	4821 while (src < srclimitfast4) {

	4822 uint32 u32 = reinterpret_cast<const uint32>(src);

	4823 src+= 4;

	4824 if ((u32 & 0x80808080) != 0) {src -= 4; break;}

	4825 }

	4826 while (src < srclimitfast2) {

	4827 uint8 uc = *src++;

	4828 if (static_cast<signed char>(uc) < 0) {src--; break;}

	4829 }

	4830

	4831 if (src < srclimitfast2) {

	4832 // We found a bigram with high bit on

	4833 // Next 5 lines commented out so we don't show all the source.

	4834 //const uint8* srctextlimit = isrc + text_length;

	4835 //if (FLAGS_enc_detect_source) {

	4836 // PsSource(src, isrc, srctextlimit);

	4837 // PsMark(src, 2, isrc, 0);

	4838 //}

	4839

	4840 uint8 byte1 = src[0];

	4841 uint8 byte2 = src[1];

	4842 uint8 byte1x2x = (byte1 & 0xf0) \| ((byte2 >> 4) & 0x0f);

	4843 uint8 byte1f = byte1;

	4844 // Flip top bit of subscript to better separate quadrant 4 (esp. for Hebre w)

	4845 byte1f ^= (byte2 & 0x80);

	4846

	4847 // The real increments

	4848 for (int j = 0; j < robust_renc_list_len; ++j) {

	4849 int rankedencoding = robust_renc_list[j];

	4850 const UnigramEntry* ue = &unigram_table[rankedencoding];

	4851 int incr = ue->b1[byte1f] + ue->b2[byte2] + ue->b12[byte1x2x];

	4852 if ((ue->b12[byte1x2x] & 0x01) != 0) {

	4853 // Use a more-precise table

	4854 int byte32x32 = ((byte1 & 0x1f) << 5) \| (byte2 & 0x1f);

	4855 int hiressub = (byte2 & 0x60) >> 5; // select w/bits 5&6 of byte 2

	4856 DCHECK(ue->hires[hiressub] != NULL);

	4857 incr += ue->hires[hiressub][byte32x32];

	4858 } else {

	4859 // Default final offset

	4860 incr += ue->so;

	4861 }

	4862 robust_renc_probs[j] += incr;

	4863 }

	4864

	4865 src += 2; // Continue after this bigram

	4866 ++bigram_count;

	4867

	4868 // Stop after 1000 bigrams reached, if at least 64KB scanned

	4869 if ((bigram_count > kMinRobustBigramCount) && (src > srclimitmin)) {

	4870 break;

	4871 }

	4872 }

	4873 }

	4874

	4875 if (FLAGS_enc_detect_source) {

	4876 fprintf(stderr, "( bigram_count = %d) do-src\n", bigram_count);

	4877 if (bigram_count == 0) {bigram_count = 1;} // zdiv

	4878 for (int i = 0; i < robust_renc_list_len; ++i) {

	4879 fprintf(stderr, "( enc[%-12.12s] = %7d (avg %d)) do-src\n",

	4880 MyRankedEncName(robust_renc_list[i]), robust_renc_probs[i],

	4881 robust_renc_probs[i] / bigram_count);

	4882 }

	4883 PsSourceFinish();

	4884 }

	4885

	4886 return bigram_count;

	4887 }

	4888

	4889 // If unreliable, rescan middle of document to see if we can get a better

	4890 // answer. Rescan is only worthwhile if there are ~200 bytes or more left,

	4891 // since the detector takes as much as 96 bytes of bigrams to decide.

	4892 Encoding Rescan(Encoding enc,

	4893 const uint8* isrc,

	4894 const uint8* src,

	4895 const uint8* srctextlimit,

	4896 const char* url_hint,

	4897 const char* http_charset_hint,

	4898 const char* meta_charset_hint,

	4899 const int encoding_hint,

	4900 const Language language_hint,

	4901 const CompactEncDet::TextCorpusType corpus_type,

	4902 bool ignore_7bit_mail_encodings,

	4903 DetectEncodingState* destatep) {

	4904 bool enc_is_reliable = destatep->reliable;

	4905 Encoding new_enc = enc;

	4906 Encoding second_best_enc =

	4907 kMapToEncoding[destatep->second_top_rankedencoding];

	4908

	4909 if (FLAGS_counts) {++rescan_used;}

	4910

	4911 int scanned_bytes = src - isrc;

	4912 int unscanned_bytes = srctextlimit - src;

	4913 int text_length = srctextlimit - isrc;

	4914 bool empty_rescan = true;

	4915

	4916 // See if enough bytes left to bother doing rescan

	4917 if (kMinRescanLength < unscanned_bytes) {

	4918 const char* text = reinterpret_cast<const char*>(isrc);

	4919

	4920 Encoding one_hint = destatep->http_hint;

	4921 if ((one_hint == UNKNOWN_ENCODING) &&

	4922 (destatep->meta_hint != UNKNOWN_ENCODING)) {

	4923 one_hint = destatep->meta_hint;

	4924 }

	4925 if ((one_hint == UNKNOWN_ENCODING) &&

	4926 (destatep->bom_hint != UNKNOWN_ENCODING)) {

	4927 one_hint = destatep->bom_hint;

	4928 }

	4929

	4930 // Go to an even offset to keep UTF-16 in synch

	4931 int middle_offset = (scanned_bytes + (unscanned_bytes / 2)) & ~1;

	4932 CHECK(middle_offset <= text_length);

	4933

	4934 // Look back a bit for a low byte to synchronize, else hope for the best.

	4935 const uint8* srcbacklimit = isrc + middle_offset - kMaxScanBack;

	4936 if (srcbacklimit < src) {

	4937 srcbacklimit = src;

	4938 }

	4939 const uint8* ss = isrc + middle_offset - 1;

	4940 while (srcbacklimit <= ss) {

	4941 if ((*ss & 0x80) == 0) {break;}

	4942 --ss;

	4943 }

	4944 // Leave middle offset unchanged unless we found a low byte

	4945 if (srcbacklimit <= ss) {

	4946 // Align to low byte or high byte just after it, whichever is even

	4947 middle_offset = (ss - isrc + 1) & ~1; // Even to keep UTF-16 in sync

	4948 }

	4949 CHECK(middle_offset <= text_length);

	4950

	4951 if (destatep->debug_data != NULL) {

	4952 SetDetailsEncLabel(destatep, ">> Rescan");

	4953 // Print the current chart before recursive call

	4954 DumpDetail(destatep);

	4955

	4956 char buff[32];

	4957 snprintf(buff, sizeof(buff), ">> Rescan[%d..%d]",

	4958 middle_offset, text_length);

	4959 PsRecurse(buff);

	4960 }

	4961

	4962 int mid_bytes_consumed;

	4963 bool mid_is_reliable;

	4964 Encoding mid_second_best_enc;

	4965 CEDInternalFlags newflags = static_cast<CEDInternalFlags>(

	4966 kCEDRescanning + kCEDForceTags);

	4967 // Recursive call for rescan of half of remaining

	4968 Encoding mid_enc = InternalDetectEncoding(

	4969 newflags,

	4970 text + middle_offset,

	4971 text_length - middle_offset,

	4972 url_hint,

	4973 http_charset_hint,

	4974 meta_charset_hint,

	4975 encoding_hint,

	4976 language_hint, // User interface lang

	4977 corpus_type,

	4978 ignore_7bit_mail_encodings,

	4979 &mid_bytes_consumed,

	4980 &mid_is_reliable,

	4981 &mid_second_best_enc);

	4982 destatep->reliable = mid_is_reliable;

	4983

	4984 empty_rescan = (mid_enc == ASCII_7BIT);

	4985

	4986 // Not the right decision if, e.g. enc=Greek, mid=ASCII7, one=KSC

	4987 // hence the !empty_rescan term

	4988 if (!empty_rescan && CompatibleEnc(one_hint, mid_enc)) {

	4989 // Encoding we just found is compatible with the

	4990 // single hint (if any); return superset

	4991 new_enc = SupersetEnc(one_hint, mid_enc);

	4992 }

	4993

	4994 // If original and mid are compatible, and both reliable,

	4995 // return new_enc = SupersetEnc(enc, mid_enc)

	4996 //

	4997 // This avoids too much weight on a bogus hint causing a RobustScan

	4998 // that gets the wrong answer

	4999 if (!empty_rescan && mid_is_reliable && enc_is_reliable &&

	5000 CompatibleEnc(enc, mid_enc)) {

	5001 new_enc = SupersetEnc(enc, mid_enc);

	5002 return new_enc;

	5003 }

	5004

	5005 // if mid unreliable, robustscan

	5006 // if mid empty, robustscan

	5007 // if original and mid not compatible, robustscan

	5008 // if mid and one_hint not compatible, robustscan

	5009

	5010 // If we found conflicting data, drop back and do a robust scan of a big

	5011 // chunk of the input over a set of candidate encodings

	5012 //

	5013 if (!mid_is_reliable \|\|

	5014 empty_rescan \|\|

	5015 !CompatibleEnc(enc, mid_enc) \|\|

	5016 !CompatibleEnc(one_hint, mid_enc)) {

	5017 int robust_renc_list_len; // Number of active encodings

	5018 int robust_renc_list[NUM_RANKEDENCODING]; // List of ranked encodings

	5019 int robust_renc_probs[NUM_RANKEDENCODING]; // List of matching probs

	5020

	5021 robust_renc_list_len = 0;

	5022 AddToSet(enc, &robust_renc_list_len, robust_renc_list);

	5023 AddToSet(second_best_enc, &robust_renc_list_len, robust_renc_list);

	5024 AddToSet(mid_enc, &robust_renc_list_len, robust_renc_list);

	5025 AddToSet(mid_second_best_enc, &robust_renc_list_len, robust_renc_list);

	5026 if (destatep->http_hint != UNKNOWN_ENCODING) {

	5027 AddToSet(destatep->http_hint, &robust_renc_list_len, robust_renc_list);

	5028 }

	5029 if (destatep->meta_hint != UNKNOWN_ENCODING) {

	5030 AddToSet(destatep->meta_hint, &robust_renc_list_len, robust_renc_list);

	5031 }

	5032 if (destatep->bom_hint != UNKNOWN_ENCODING) {

	5033 AddToSet(destatep->bom_hint, &robust_renc_list_len, robust_renc_list);

	5034 }

	5035 if (destatep->tld_hint != UNKNOWN_ENCODING) {

	5036 AddToSet(destatep->tld_hint, &robust_renc_list_len, robust_renc_list);

	5037 }

	5038

	5039 // Separate simple scan

	5040 // =====================

	5041 if (destatep->debug_data != NULL) {

	5042 SetDetailsEncLabel(destatep, ">> RobustScan");

	5043 // Print the current chart before recursive call

	5044 DumpDetail(destatep);

	5045

	5046 char buff[32];

	5047 snprintf(buff, sizeof(buff), ">> RobustScan[0..%d]", text_length);

	5048 PsRecurse(buff);

	5049 }

	5050

	5051 int bigram_count = RobustScan(text, text_length,

	5052 robust_renc_list_len, robust_renc_list, robust_renc_probs);

	5053

	5054 // Default to new_enc and update if something better was found

	5055 int best_prob = -1;

	5056 // TEMP print

	5057 for (int i = 0; i < robust_renc_list_len; ++i) {

	5058 if (best_prob < robust_renc_probs[i]) {

	5059 best_prob = robust_renc_probs[i];

	5060 new_enc = kMapToEncoding[robust_renc_list[i]];

	5061 }

	5062 }

	5063

	5064 if (destatep->debug_data != NULL) {

	5065 char buff[32];

	5066 snprintf(buff, sizeof(buff), "=Robust[%d] %s",

	5067 bigram_count, MyEncodingName(new_enc));

	5068 SetDetailsEncProb(destatep,

	5069 0,

	5070 CompactEncDet::BackmapEncodingToRankedEncoding(new_enc ),

	5071 buff);

	5072 }

	5073 }

	5074 } // End if enough bytes

	5075

	5076 return new_enc;

	5077 }

	5078

	5079 // With no hints at all, and perhaps on rescan, we relax our pickiness

	5080 // and go ahead and accept the top multibyte encodings, even though

	5081 // strictly their web pages should have declared an explicit encoding to

	5082 // avoid the HTML standard's default ISO-8859-1.

	5083 bool NoHintsCloseEnoughCompatible(Encoding top_enc) {

	5084 // First test accepts degenerate cases plus UTF8 and UTF8UTF8

	5085 if (CompatibleEnc(UTF8, top_enc)) {return true;}

	5086

	5087 // The rest look for exact match of base encoding

	5088 Encoding base_enc = kMapEncToBaseEncoding[top_enc];

	5089 if (base_enc == JAPANESE_EUC_JP) {return true;}

	5090 if (base_enc == JAPANESE_SHIFT_JIS) {return true;}

	5091 if (base_enc == CHINESE_BIG5) {return true;}

	5092 if (base_enc == CHINESE_GB) {return true;}

	5093 if (base_enc == KOREAN_EUC_KR) {return true;}

	5094 return false;

	5095 }

	5096

	5097

	5098

	5099 // Scan raw bytes and detect most likely encoding

	5100 // Design goals:

	5101 // Skip over big initial stretches of seven-bit ASCII bytes very quickly

	5102 // Thread safe

	5103 // Works equally well on

	5104 // 50-byte queries,

	5105 // 5000-byte email and

	5106 // 50000-byte web pages

	5107 // Length 0 input returns ISO_8859_1 (ASCII) encoding

	5108 // Setting ignore_7bit_mail_encodings effectively turns off detection of

	5109 // UTF-7, HZ, and ISO-2022-xx

	5110 Encoding InternalDetectEncoding(

	5111 CEDInternalFlags flags, const char* text, int text_length,

	5112 const char* url_hint, const char* http_charset_hint,

	5113 const char* meta_charset_hint, const int encoding_hint,

	5114 const Language language_hint, // User interface lang

	5115 const CompactEncDet::TextCorpusType corpus_type,

	5116 bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable,

	5117 Encoding* second_best_enc) {

	5118 *bytes_consumed = 0;

	5119 *is_reliable = false;

	5120 *second_best_enc = ASCII_7BIT;

	5121

	5122 if (text_length == 0) {

	5123 // Follow the spec. Text might be NULL.

	5124 *is_reliable = true;

	5125 return ISO_8859_1;

	5126 }

	5127

	5128 // For very short (20-50 byte) input strings that are highly likely to be

	5129 // all printable ASCII, our startup overhead might dominate. We have to do the

	5130 // full detection if the ISO-2022-xx, HZ, or UTF-7 encodings are possible.

	5131 // Otherwise, we can do a quick scan for printable ASCII.

	5132 if ((text_length <= 500) && ignore_7bit_mail_encodings &&

	5133 QuickPrintableAsciiScan(text, text_length)) {

	5134 *is_reliable = true;

	5135 return ASCII_7BIT;

	5136 }

	5137

	5138 // Go for the full boat detection

	5139 DetectEncodingState destate;

	5140 InitDetectEncodingState(&destate);

	5141

	5142 std::unique_ptr<DetailEntry[]> scoped_debug_data;

	5143 if (FLAGS_enc_detect_detail) {

	5144 // Allocate max 10 details per bigram

	5145 scoped_debug_data.reset(new DetailEntry[kMaxPairs * 10]);

	5146 destate.debug_data = scoped_debug_data.get();

	5147 // NOTE: destate and scoped_debug_data have exactly the same scope

	5148 // All other FLAGS_enc_detect_detail tests use destate.debug_data != NULL

	5149 }

	5150

	5151 // Get text length limits

	5152 // Typically, we scan the first 16KB looking for all encodings, then

	5153 // scan the rest (up to 256KB) a bit faster by no longer looking for

	5154 // interesting bytes below 0x80. This allows us to skip over runs of

	5155 // 7-bit-ASCII much more quickly.

	5156 int slow_len = minint(text_length, (FLAGS_enc_detect_slow_max_kb << 10));

	5157 int fast_len = minint(text_length, (FLAGS_enc_detect_fast_max_kb << 10));

	5158

	5159 // Initialize pointers.

	5160 // In general, we do not look at last 3 bytes of input in the fast scan

	5161 // We do, however want to look at the last byte or so in the slow scan,

	5162 // especilly in the case of a very short text whose only interesting

	5163 // information is a 3-byte UTF-8 character in the last three bytes.

	5164 // If necessary, we fake a last bigram with 0x20 space as a pad byte.

	5165 const uint8* isrc = reinterpret_cast<const uint8*>(text);

	5166 const uint8* src = isrc;

	5167 const uint8* srctextlimit = isrc + text_length;

	5168 const uint8* srclimitslow2 = isrc + slow_len - 1;

	5169 const uint8* srclimitfast2 = isrc + fast_len - 1;

	5170 const uint8* srclimitfast4 = isrc + fast_len - 3;

	5171 if (srclimitslow2 > srclimitfast2) {

	5172 srclimitslow2 = srclimitfast2;

	5173 }

	5174 destate.initial_src = isrc;

	5175 destate.limit_src = srclimitfast2 + 1; // May include last byte

	5176 destate.prior_src = isrc;

	5177 destate.last_pair = isrc - 2;

	5178

	5179 const char* scan_table = kTestPrintableAsciiTildePlus;

	5180 if (ignore_7bit_mail_encodings) {

	5181 // Caller wants to ignore UTF-7, HZ, ISO-2022-xx

	5182 // Don't stop on + (for UTF-7), nor on ~ (for HZ)

	5183 scan_table = kTestPrintableAscii;

	5184 }

	5185 int exit_reason = 0;

	5186

	5187 if (destate.debug_data != NULL) {

	5188 BeginDetail(&destate);

	5189 // Take any incoming watch encoding name and backmap to the corresponding

	5190 // ranked enum value

	5191 watch1_rankedenc = LookupWatchEnc(FLAGS_enc_detect_watch1);

	5192 if (watch1_rankedenc >= 0) {

	5193 fprintf(stderr, "/track-me %d def\n", watch1_rankedenc);

	5194 }

	5195

	5196 watch2_rankedenc = LookupWatchEnc(FLAGS_enc_detect_watch2);

	5197 if (watch2_rankedenc >= 0) {

	5198 fprintf(stderr, "/track-me2 %d def\n", watch2_rankedenc);

	5199 }

	5200

	5201 fprintf(stderr, "%% kDerateHintsBelow = %d\n", kDerateHintsBelow);

	5202 }

	5203 if (FLAGS_enc_detect_source) {

	5204 PsSourceInit(kPsSourceWidth);

	5205 PsSource(src, isrc, srctextlimit);

	5206 PsMark(src, 4, isrc, 0);

	5207 }

	5208

	5209 // Apply hints, if any, to probabilities

	5210 // NOTE: Encoding probabilities are all zero at this point

	5211 ApplyHints(url_hint,

	5212 http_charset_hint,

	5213 meta_charset_hint,

	5214 encoding_hint,

	5215 language_hint,

	5216 corpus_type,

	5217 &destate);

	5218

	5219 // NOTE: probabilities up to this point are subject to derating for

	5220 // small numbers of bigrams.

	5221 // Probability changes after this point are not derated.

	5222

	5223 // Do first 4 bytes to pick off strong markers

	5224 InitialBytesBoost(isrc, text_length, &destate);

	5225

	5226 bool ignored_some_tag_text = false;

	5227 int tag_text_bigram_count = 0;

	5228

	5229 // Slower loop, approx 500 MB/sec (2.8 GHz P4)

	5230 // ASSERT(srclimitslow2 <= srclimitfast2);

	5231 //====================================

	5232 DoMoreSlowLoop:

	5233 while (src < srclimitslow2) {

	5234 // Skip to next interesting byte (this is the slower part)

	5235 while (src < srclimitslow2) {

	5236 uint8 uc = *src++;

	5237 if (scan_table[uc] != 0) {exit_reason = scan_table[uc]; src--; break;}

	5238 }

	5239

	5240 if (src < srclimitslow2) {

	5241 if (FLAGS_enc_detect_source) {

	5242 PsSource(src, isrc, srctextlimit); // don't mark yet

	5243 }

	5244

	5245 int weightshift = 0;

	5246 // In the first 16KB, derate new text run inside <title>...</title> and

	5247 // inside <!-- ... -->

	5248 if (////((destate.last_pair + 6) <= src) && // if beyond last one

	5249 ////(tag_text_bigram_count < kMaxBigramsTagTitleText) &&

	5250 (corpus_type == CompactEncDet::WEB_CORPUS) && // and web page

	5251 !CEDFlagForceTags(flags)) { // and OK to skip

	5252 ////if (TextInsideTag(destate.last_pair + 2, src, srclimitslow2)) {

	5253 if (TextInsideTag(isrc, src, srclimitslow2)) {

	5254 if (tag_text_bigram_count >= kMaxBigramsTagTitleText) {

	5255 ignored_some_tag_text = true;

	5256 src = SkipToTagEnd(destate.last_pair + 2, src, srclimitslow2);

	5257 continue;

	5258 } else {

	5259 weightshift = kWeightshiftForTagTitleText;

	5260 ++tag_text_bigram_count;

	5261 }

	5262 }

	5263 }

	5264 if (FLAGS_enc_detect_source) {

	5265 PsMark(src, 2, isrc, weightshift);

	5266 }

	5267 // Saves byte pair and offset

	5268 bool pruned = IncrementAndBoostPrune(src, srctextlimit - src,

	5269 &destate, weightshift, exit_reason);

	5270 // Advance; if inside tag, advance to end of tag

	5271 if (weightshift == 0) {

	5272 src += exit_reason; // 1 Ascii, 2 other

	5273 } else {

	5274 src += exit_reason; // 1 Ascii, 2 other

	5275 //// src = SkipToTagEnd(destate.last_pair, src, srclimitslow2);

	5276 }

	5277

	5278 if (pruned) {

	5279 // Scoring and active encodings have been updated

	5280 if (destate.done) {break;}

	5281 // Check if all the reasons for the slow loop have been pruned

	5282 // If so, go to fast loop

	5283 if (!SevenBitActive(&destate)) {break;}

	5284 }

	5285 }

	5286 }

	5287 //====================================

	5288

	5289 // We reached the end of a slow scan, possibly because no more SevenBitActive,

	5290 // or possibly are at end of source.

	5291 // If we are exactly at the end of the source, make sure we look at the very

	5292 // last byte.

	5293 bool very_last_byte_incremented = false;

	5294 if (src == (srctextlimit - 1)) {

	5295 exit_reason = scan_table[*src];

	5296 if (exit_reason != 0) {

	5297 // The very last byte is an interesting byte

	5298 // Saves byte pair and offset

	5299 //printf("Interesting very last slow byte = 0x%02x\n", *src);

	5300 IncrementAndBoostPrune(src, srctextlimit - src, &destate, 0, exit_reason);

	5301 very_last_byte_incremented = true;

	5302 }

	5303 }

	5304

	5305 if (FLAGS_enc_detect_source) {

	5306 PsSource(src, isrc, srctextlimit);

	5307 PsMark(src, 2, isrc, 0);

	5308 }

	5309 // Force a pruning based on whatever we have

	5310 // Delete the seven-bit encodings if there is no evidence of them so far

	5311 BoostPrune(src, &destate, PRUNE_SLOWEND);

	5312

	5313 if (!destate.done) {

	5314 // If not clear yet on 7-bit-encodings and more bytes, do more slow

	5315 if (SevenBitActive(&destate) && (src < srclimitfast2)) {

	5316 // Increment limit by another xxxK

	5317 slow_len += (FLAGS_enc_detect_slow_max_kb << 10);

	5318 srclimitslow2 = isrc + slow_len - 1;

	5319 if (srclimitslow2 > srclimitfast2) {

	5320 srclimitslow2 = srclimitfast2;

	5321 }

	5322 if (!UTF7OrHzActive(&destate)) {

	5323 // We can switch to table that does not stop on + ~

	5324 scan_table = kTestPrintableAscii;

	5325 }

	5326 goto DoMoreSlowLoop;

	5327 }

	5328

	5329

	5330 exit_reason = 2;

	5331 // Faster loop, no 7-bit-encodings possible, approx 3000 GB/sec

	5332 //====================================

	5333 while (src < srclimitfast2) {

	5334 // Skip to next interesting byte (this is the faster part)

	5335 while (src < srclimitfast4) {

	5336 uint32 u32 = reinterpret_cast<const uint32>(src);

	5337 src+= 4;

	5338 if ((u32 & 0x80808080) != 0) {src -= 4; break;}

	5339 }

	5340 while (src < srclimitfast2) {

	5341 uint8 uc = *src++;

	5342 if (static_cast<signed char>(uc) < 0) {src--; break;}

	5343 }

	5344

	5345 if (src < srclimitfast2) {

	5346 if (FLAGS_enc_detect_source) {

	5347 PsSource(src, isrc, srctextlimit);

	5348 PsMark(src, 2, isrc, 0);

	5349 }

	5350 // saves byte pair and offset

	5351 bool pruned = IncrementAndBoostPrune(src, srctextlimit - src,

	5352 &destate, 0, exit_reason);

	5353 src += exit_reason; // 1 Ascii, 2 other

	5354 if (pruned) {

	5355 // Scoring and active encodings have been updated

	5356 if (destate.done) {break;}

	5357 }

	5358 }

	5359 }

	5360 //====================================

	5361 // We reached the end of fast scan

	5362

	5363 // If we are exactly at the end of the source, make sure we look at the very

	5364 // last byte.

	5365 if (src == (srctextlimit - 1) && !very_last_byte_incremented) {

	5366 exit_reason = scan_table[*src];

	5367 if (exit_reason != 0) {

	5368 // The very last byte is an interesting byte

	5369 // Saves byte pair and offset

	5370 //printf("Interesting very last fast byte = 0x%02x\n", *src);

	5371 IncrementAndBoostPrune(src, srctextlimit - src, &destate, 0, exit_reason );

	5372 very_last_byte_incremented = true;

	5373 }

	5374 }

	5375

	5376 } // End if !done

	5377

	5378 if (FLAGS_enc_detect_source) {

	5379 PsSource(src, isrc, srctextlimit);

	5380 PsMark(src, 2, isrc, 0);

	5381 }

	5382 // Force a pruning based on whatever we have

	5383 BoostPrune(src, &destate, PRUNE_FINAL);

	5384

	5385 if (FLAGS_enc_detect_summary) {

	5386 DumpSummary(&destate, AsciiPair, 32);

	5387 DumpSummary(&destate, OtherPair, 32);

	5388 }

	5389 if (FLAGS_enc_detect_source) {

	5390 PsSourceFinish();

	5391 }

	5392 if (destate.debug_data != NULL) {

	5393 //// DumpDetail(&destate);

	5394 }

	5395

	5396

	5397 if (ignored_some_tag_text &&

	5398 (kMapToEncoding[destate.top_rankedencoding] == ASCII_7BIT)) {

	5399 // There were some interesting bytes, but only in tag text.

	5400 // Recursive call to reprocess looking at the tags this time.

	5401

	5402 if (destate.debug_data != NULL) {

	5403 SetDetailsEncLabel(&destate, ">> Recurse/tags");

	5404 // Print the current chart before recursive call

	5405 DumpDetail(&destate);

	5406

	5407 char buff[32];

	5408 snprintf(buff, sizeof(buff), ">> Recurse for tags");

	5409 PsRecurse(buff);

	5410 }

	5411

	5412 // Recursive call for high bytes in tags [no longer used, 1/16 tag score]

	5413 Encoding enc2 = InternalDetectEncoding(

	5414 kCEDForceTags, // force

	5415 text,

	5416 text_length,

	5417 url_hint,

	5418 http_charset_hint,

	5419 meta_charset_hint,

	5420 encoding_hint,

	5421 language_hint,

	5422 corpus_type,

	5423 ignore_7bit_mail_encodings,

	5424 bytes_consumed,

	5425 is_reliable,

	5426 second_best_enc);

	5427

	5428 if (destate.debug_data != NULL) {

	5429 // Show winning encoding and dump PostScript

	5430 char buff[32];

	5431 snprintf(buff, sizeof(buff), "=2 %s", MyEncodingName(enc2));

	5432 SetDetailsEncProb(&destate,

	5433 0,

	5434 CompactEncDet::BackmapEncodingToRankedEncoding(enc2),

	5435 buff);

	5436 DumpDetail(&destate);

	5437 }

	5438

	5439 return enc2;

	5440 }

	5441

	5442

	5443 // If the detected encoding does not match default/hints, or if the hints

	5444 // conflict with each other, mark as unreliable. This can be used to trigger

	5445 // further scoring.

	5446 // Three buckets of input documents;

	5447 // ~19% of the web no hints, and top == 7bit, Latin1, or CP1252

	5448 // ~79% of the web one or more hints, all same encoding X and top == X

	5449 // ~ 2% of the web one or more hints that are inconsistent

	5450

	5451 Encoding top_enc = kMapToEncoding[destate.top_rankedencoding];

	5452 Encoding one_hint = destate.http_hint;

	5453 if ((one_hint == UNKNOWN_ENCODING) &&

	5454 (destate.meta_hint != UNKNOWN_ENCODING)) {

	5455 one_hint = destate.meta_hint;

	5456 }

	5457 if ((one_hint == UNKNOWN_ENCODING) &&

	5458 (destate.bom_hint != UNKNOWN_ENCODING)) {

	5459 one_hint = destate.bom_hint;

	5460 }

	5461

	5462 bool found_compatible_encoding = true;

	5463 if (one_hint == UNKNOWN_ENCODING) {

	5464 // [~14% of the web] No hints, and top == 7bit, Latin1, or CP1252

	5465 if (!CompatibleEnc(ISO_8859_1, top_enc)) {

	5466 found_compatible_encoding = false;

	5467 // If there is nothing but a TLD hint and its top encoding matches, OK

	5468 if ((destate.tld_hint != UNKNOWN_ENCODING) &&

	5469 CompatibleEnc(destate.tld_hint, top_enc)) {

	5470 found_compatible_encoding = true;

	5471 }

	5472 }

	5473 } else if (CompatibleEnc(one_hint, destate.http_hint) &&

	5474 CompatibleEnc(one_hint, destate.meta_hint) &&

	5475 CompatibleEnc(one_hint, destate.bom_hint)) {

	5476 // [~83% of the web] One or more hints, all same encoding X and top == X

	5477 if (!CompatibleEnc(one_hint, top_enc)) {

	5478 // [~ 2% of the web] Oops, not the declared encoding

	5479 found_compatible_encoding = false;

	5480 }

	5481 } else {

	5482 // [~ 3% of the web] Two or more hints that are inconsistent

	5483 one_hint = UNKNOWN_ENCODING;

	5484 found_compatible_encoding = false;

	5485 }

	5486

	5487 // If we turned Latin1 into Latin2 or 7 via trigrams, don't fail it here

	5488 if (destate.do_latin_trigrams) {

	5489 if (CompatibleEnc(kMapToEncoding[F_Latin1], top_enc) \|\|

	5490 CompatibleEnc(kMapToEncoding[F_Latin2], top_enc) \|\|

	5491 CompatibleEnc(kMapToEncoding[F_CP1250], top_enc) \|\|

	5492 CompatibleEnc(kMapToEncoding[F_ISO_8859_13], top_enc)) {

	5493 found_compatible_encoding = true;

	5494 destate.reliable = true;

	5495 }

	5496 }

	5497

	5498 // If top encoding is not compatible with the hints, but it is reliably

	5499 // UTF-8, accept it anyway.

	5500 // This will perform badly with mixed UTF-8 prefix plus another encoding in

	5501 // the body if done too early, so we want to be rescanning.

	5502 if (!found_compatible_encoding &&

	5503 destate.reliable &&

	5504 NoHintsCloseEnoughCompatible(top_enc) &&

	5505 (destate.next_interesting_pair[OtherPair] >= kStrongPairs) &&

	5506 CEDFlagRescanning(flags)) {

	5507 found_compatible_encoding = true;

	5508 }

	5509

	5510 // Hold off on this so Rescan() can see if the original encoding was reliable

	5511 //if (!found_compatible_encoding) {

	5512 // destate.reliable = false;

	5513 //}

	5514

	5515 // If unreliable, try rescoring to separate some encodings

	5516 if (!destate.reliable \|\| !found_compatible_encoding) {

	5517 top_enc = Rescore(top_enc, isrc, srctextlimit, &destate);

	5518 }

	5519

	5520 *second_best_enc = kMapToEncoding[destate.second_top_rankedencoding];

	5521

	5522 // If unreliable, and not already rescanning,

	5523 // rescan middle of document to see if we can get a better

	5524 // answer. Rescan is only worthwhile if there are ~200 bytes or more left,

	5525 // since the detector takes as much as 96 bytes of bigrams to decide.

	5526 //

	5527 // CANNOT retry ISO-2022-xx HZ etc. because no declaration escape at the front

	5528 // or we may land in the middle of some partial state. Skip them all.

	5529 //

	5530 if ((!destate.reliable \|\| !found_compatible_encoding) &&

	5531 !CEDFlagRescanning(flags) &&

	5532 !SevenBitEncoding(top_enc)) {

	5533 top_enc = Rescan(top_enc,

	5534 isrc,

	5535 src,

	5536 srctextlimit,

	5537 url_hint,

	5538 http_charset_hint,

	5539 meta_charset_hint,

	5540 encoding_hint,

	5541 language_hint,

	5542 corpus_type,

	5543 ignore_7bit_mail_encodings,

	5544 &destate);

	5545 } else {

	5546 if (!found_compatible_encoding) {

	5547 destate.reliable = false;

	5548 }

	5549 }

	5550

	5551 if (destate.debug_data != NULL) {

	5552 // Dump PostScript

	5553 DumpDetail(&destate);

	5554 }

	5555

	5556 *bytes_consumed = src - isrc + 1; // We looked 1 byte beyond src

	5557 *is_reliable = destate.reliable;

	5558 return top_enc;

	5559 }

	5560

	5561 Encoding CompactEncDet::DetectEncoding(

	5562 const char* text, int text_length, const char* url_hint,

	5563 const char* http_charset_hint, const char* meta_charset_hint,

	5564 const int encoding_hint,

	5565 const Language language_hint, // User interface lang

	5566 const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings,

	5567 int* bytes_consumed, bool* is_reliable) {

	5568 if (FLAGS_ced_echo_input) {

	5569 string temp(text, text_length);

	5570 fprintf(stderr, "CompactEncDet::DetectEncoding()\n%s\n\n", temp.c_str());

	5571 }

	5572

	5573 if (FLAGS_counts) {

	5574 encdet_used = 0;

	5575 rescore_used = 0;

	5576 rescan_used = 0;

	5577 robust_used = 0;

	5578 looking_used = 0;

	5579 doing_used = 0;

	5580 ++encdet_used;

	5581 }

	5582 if (FLAGS_dirtsimple) {

	5583 // Just count first 64KB bigram encoding probabilities for each encoding

	5584 int robust_renc_list_len; // Number of active encodings

	5585 int robust_renc_list[NUM_RANKEDENCODING]; // List of ranked encodings

	5586 int robust_renc_probs[NUM_RANKEDENCODING]; // List of matching probs

	5587

	5588 for (int i = 0; i < NUM_RANKEDENCODING; ++i) {

	5589 robust_renc_list[i] = i;

	5590 }

	5591 robust_renc_list_len = NUM_RANKEDENCODING;

	5592

	5593 RobustScan(text, text_length,

	5594 robust_renc_list_len, robust_renc_list, robust_renc_probs);

	5595

	5596 // Pick off best encoding

	5597 int best_prob = -1;

	5598 Encoding enc = UNKNOWN_ENCODING;

	5599 for (int i = 0; i < robust_renc_list_len; ++i) {

	5600 if (best_prob < robust_renc_probs[i]) {

	5601 best_prob = robust_renc_probs[i];

	5602 enc = kMapToEncoding[robust_renc_list[i]];

	5603 }

	5604 }

	5605

	5606 *bytes_consumed = minint(text_length, (kMaxKBToRobustScan << 10));

	5607 *is_reliable = true;

	5608 if (FLAGS_counts) {

	5609 printf("CEDcounts ");

	5610 while (encdet_used--) {printf("encdet ");}

	5611 while (rescore_used--) {printf("rescore ");}

	5612 while (rescan_used--) {printf("rescan ");}

	5613 while (robust_used--) {printf("robust ");}

	5614 while (looking_used--) {printf("looking ");}

	5615 while (doing_used--) {printf("doing ");}

	5616 printf("\n");

	5617 }

	5618

	5619 return enc;

	5620 }

	5621

	5622 Encoding second_best_enc;

	5623 Encoding enc = InternalDetectEncoding(kCEDNone,

	5624 text,

	5625 text_length,

	5626 url_hint,

	5627 http_charset_hint,

	5628 meta_charset_hint,

	5629 encoding_hint,

	5630 language_hint, // User interface lang

	5631 corpus_type,

	5632 ignore_7bit_mail_encodings,

	5633 bytes_consumed,

	5634 is_reliable,

	5635 &second_best_enc);

	5636 if (FLAGS_counts) {

	5637 printf("CEDcounts ");

	5638 while (encdet_used--) {printf("encdet ");}

	5639 while (rescore_used--) {printf("rescore ");}

	5640 while (rescan_used--) {printf("rescan ");}

	5641 while (robust_used--) {printf("robust ");}

	5642 while (looking_used--) {printf("looking ");}

	5643 while (doing_used--) {printf("doing ");}

	5644 printf("\n");

	5645 }

	5646 return enc;

	5647 }

	5648

	5649

	5650 // Return top encoding hint for given string

	5651 Encoding CompactEncDet::TopEncodingOfLangHint(const char* name) {

	5652 string normalized_lang = MakeChar8(string(name));

	5653 int n = HintBinaryLookup8(kLangHintProbs, kLangHintProbsSize,

	5654 normalized_lang.c_str());

	5655 if (n < 0) {return UNKNOWN_ENCODING;}

	5656

	5657 // Charset is eight bytes, probability table is eight bytes

	5658 int toprankenc =

	5659 TopCompressedProb(&kLangHintProbs[n].key_prob[kMaxLangKey],

	5660 kMaxLangVector);

	5661 return kMapToEncoding[toprankenc];

	5662 }

	5663

	5664 // Return top encoding hint for given string

	5665 Encoding CompactEncDet::TopEncodingOfTLDHint(const char* name) {

	5666 string normalized_tld = MakeChar4(string(name));

	5667 int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,

	5668 normalized_tld.c_str());

	5669 if (n < 0) {return UNKNOWN_ENCODING;}

	5670

	5671 // TLD is four bytes, probability table is 12 bytes

	5672 int toprankenc =

	5673 TopCompressedProb(&kTLDHintProbs[n].key_prob[kMaxTldKey],

	5674 kMaxTldVector);

	5675 return kMapToEncoding[toprankenc];

	5676 }

	5677

	5678 // Return top encoding hint for given string

	5679 Encoding CompactEncDet::TopEncodingOfCharsetHint(const char* name) {

	5680 string normalized_charset = MakeChar44(string(name));

	5681 int n = HintBinaryLookup8(kCharsetHintProbs, kCharsetHintProbsSize,

	5682 normalized_charset.c_str());

	5683 if (n < 0) {return UNKNOWN_ENCODING;}

	5684

	5685 // Charset is eight bytes, probability table is eight bytes

	5686 int toprankenc =

	5687 TopCompressedProb(&kCharsetHintProbs[n].key_prob[kMaxCharsetKey],

	5688 kMaxCharsetVector);

	5689 return kMapToEncoding[toprankenc];

	5690 }

	5691

	5692 const char* CompactEncDet::Version(void) {

	5693 return kVersion;

	5694 }

OLD	NEW