third_party/cld/encodings/compact_enc_det/compact_enc_det_hint_code.cc - Issue 1956183002: CL for perf tryjob on linux

Side by Side Diff: third_party/cld/encodings/compact_enc_det/compact_enc_det_hint_code.cc

Issue 1956183002: CL for perf tryjob on linux (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « third_party/cld/encodings/compact_enc_det/compact_enc_det_hint_code.h ('k') | third_party/cld/encodings/compact_enc_det/compact_enc_det_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 //

	2 // Copyright 2006 Google Inc. All Rights Reserved.

	3 // Author: dsites@google.com (Dick Sites)

	4 //

	5

	6

	7 #include "encodings/compact_enc_det/compact_enc_det_hint_code.h"

	8

	9 #include <ctype.h> // for isalpha

	10 #include <string.h> // for NULL, memchr, strlen, etc

	11

	12 #include "base/basictypes.h" // for uint8, uint32

	13 //#include "webutil/url/url.h" // for URL

	14

	// Byte-to-byte normalization table for charset names, indexed by the byte
	// value 0..255: 'A'-'Z' and 'a'-'z' map to lowercase 'a'-'z', '0'-'9' map to
	// themselves, and every other byte maps to minus '-' (0x2d).
	static const char kCharsetToLowerTbl[256] = {
	  // 0x00-0x2f: control characters and punctuation
	  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
	  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
	  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
	  // 0x30-0x3f: digits kept as-is
	  0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, 0x38,0x39,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,

	  // 0x40-0x5f: uppercase letters folded to lowercase
	  0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
	  0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,
	  // 0x60-0x7f: lowercase letters kept as-is
	  0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
	  0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,

	  // 0x80-0xbf: non-ASCII bytes
	  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
	  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
	  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
	  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,

	  // 0xc0-0xff: non-ASCII bytes
	  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
	  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
	  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
	  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
	};

	37

	38

	// Membership table indexed by byte value 0..255: 1 for the ASCII letters
	// 'A'-'Z' and 'a'-'z', 0 for every other byte.
	static const char kIsAlpha[256] = {
	  /* 0x00 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0x10 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0x20 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0x30 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0x40 */ 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // @ A-O
	  /* 0x50 */ 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,  // P-Z
	  /* 0x60 */ 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // ` a-o
	  /* 0x70 */ 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,  // p-z

	  /* 0x80 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0x90 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0xa0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0xb0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0xc0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0xd0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0xe0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0xf0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	};

	50

	// Membership table indexed by byte value 0..255: 1 for the ASCII digits
	// '0'-'9', 0 for every other byte.
	static const char kIsDigit[256] = {
	  /* 0x00 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0x10 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0x20 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0x30 */ 1,1,1,1,1,1,1,1, 1,1,0,0,0,0,0,0,  // 0-9
	  /* 0x40 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0x50 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0x60 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0x70 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

	  /* 0x80 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0x90 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0xa0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0xb0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0xc0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0xd0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0xe0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	  /* 0xf0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	};

	62

	// Placeholder names "FakeEnc100" .. "FakeEnc119"; entry i names value 100 + i.
	static const char* kFakeEncodingName[] = {
	  "FakeEnc100", "FakeEnc101", "FakeEnc102", "FakeEnc103",
	  "FakeEnc104", "FakeEnc105", "FakeEnc106", "FakeEnc107",
	  "FakeEnc108", "FakeEnc109", "FakeEnc110", "FakeEnc111",
	  "FakeEnc112", "FakeEnc113", "FakeEnc114", "FakeEnc115",
	  "FakeEnc116", "FakeEnc117", "FakeEnc118", "FakeEnc119",
	};

	// Placeholder names "FakeEnc_0" .. "FakeEnc_4"; entry i carries suffix i.
	static const char* kFakeEncodingName2[] = {
	  "FakeEnc_0",
	  "FakeEnc_1",
	  "FakeEnc_2",
	  "FakeEnc_3",
	  "FakeEnc_4",
	};

	72

	73 // Return name for extended encoding

	74 const char* MyEncodingName(Encoding enc) {

	75 if (enc < 0) {

	76 return "~";

	77 }

	78 if (enc == ISO_8859_1) {

	79 return "Latin1"; // I can't stand "ASCII" for this

	80 }

	81 if (enc < NUM_ENCODINGS) {

	82 return EncodingName(enc);

	83 }

	84 // allow fake names, for exploration

	85 if ((NUM_ENCODINGS <= enc) && (enc < (NUM_ENCODINGS + 4))) {

	86 return kFakeEncodingName2[enc - NUM_ENCODINGS];

	87 }

	88 if ((100 <= enc) && (enc < 120)) {

	89 return kFakeEncodingName[enc - 100];

	90 }

	91 return "~";

	92 }

	93

	94

	// Characters accepted as part of a charset name when scanning HTML.
	// http://www.iana.org/assignments/character-sets says a charset name is up to
	// 40 bytes of any printable ASCII, but that can't be right when parsing HTML;
	// at least quote is not allowed. This set covers letters, digits, and all
	// punctuation found in registered names as of April 2006.
	static const char* kWordLetters =
	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
	    "0123456789-_.:()";

	103

	104

	105 // Normalize ASCII string to first 4 alphabetic chars and last 4 digit chars

	106 // Letters are forced to lowercase ASCII

	107 // Used to normalize charset= values

	108 string MakeChar44(const string& str) {

	109 string res("________"); // eight underscores

	110 int l_ptr = 0;

	111 int d_ptr = 0;

	112 for (int i = 0; i < str.size(); ++i) {

	113 uint8 uc = static_cast<uint8>(str[i]);

	114 if (kIsAlpha[uc]) {

	115 if (l_ptr < 4) { // Else ignore

	116 res[l_ptr] = kCharsetToLowerTbl[uc];

	117 l_ptr++;

	118 }

	119 } else if (kIsDigit[uc]) {

	120 if (d_ptr < 4) {

	121 res[4 + d_ptr] = kCharsetToLowerTbl[uc];

	122 } else {

	123 // Keep last 4 digits by shifting left

	124 res[4] = res[5];

	125 res[5] = res[6];

	126 res[6] = res[7];

	127 res[7] = kCharsetToLowerTbl[uc];

	128 }

	129 d_ptr++;

	130 } // If neither letter nor digit, drop entirely

	131 }

	132 return res;

	133 }

	134

	135 // Normalize ASCII string to first 8 alphabetic/digit chars

	136 // Letters are forced to lowercase ASCII

	137 // Used to normalize TLD values

	138 string MakeChar4(const string& str) {

	139 string res("____"); // four underscores

	140 int l_ptr = 0;

	141 for (int i = 0; i < str.size(); ++i) {

	142 uint8 uc = static_cast<uint8>(str[i]);

	143 if (kIsAlpha[uc] \| kIsDigit[uc]) {

	144 if (l_ptr < 4) { // Else ignore

	145 res[l_ptr] = kCharsetToLowerTbl[uc];

	146 l_ptr++;

	147 }

	148 }

	149 }

	150 return res;

	151 }

	152

	153 // Normalize ASCII string to first 8 alphabetic/digit chars

	154 // Letters are forced to lowercase ASCII

	155 // Used to normalize TLD values

	156 string MakeChar8(const string& str) {

	157 string res("________"); // eight dots

	158 int l_ptr = 0;

	159 for (int i = 0; i < str.size(); ++i) {

	160 uint8 uc = static_cast<uint8>(str[i]);

	161 if (kIsAlpha[uc] \| kIsDigit[uc]) {

	162 if (l_ptr < 8) { // Else ignore

	163 res[l_ptr] = kCharsetToLowerTbl[uc];

	164 l_ptr++;

	165 }

	166 }

	167 }

	168 return res;

	169 }

	170

	171 // A-Z to a-z and all non-digits-letters to minus '-'

	172 void StringToLowercase(string* str) {

	173 for (int i = 0; i < str->size(); i++) {

	174 (str)[i] = kCharsetToLowerTbl[static_cast<uint8>((str)[i])];

	175 }

	176 }

	177

	// Returns true when each of the |len| characters of |str| starting at
	// |wordstart_offset| is an ASCII digit '0'-'9'. An empty range (len <= 0)
	// yields true. No bounds checking is done; the caller must pass a range that
	// lies inside |str|.
	bool AllDigits(const string& str, int wordstart_offset, int len) {
	  const int stop = wordstart_offset + len;
	  for (int pos = wordstart_offset; pos < stop; ++pos) {
	    const char ch = str[pos];
	    if ((ch < '0') || ('9' < ch)) {
	      return false;
	    }
	  }
	  return true;
	}

	186

	187

	// ASCII-only case folding helpers; bytes outside A-Z / a-z are returned as-is.
	inline char lower(char c) {
	  return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
	}
	inline char upper(char c) {
	  return (c >= 'a' && c <= 'z') ? static_cast<char>(c - 'a' + 'A') : c;
	}

	// This is like strstr, but without assuming that the char* is null-terminated,
	// and the comparison is case-insensitive. (Real files have "Meta", "meta",
	// and "META". Some files have more than one version. Likewise for "charset".)
	//
	// Searches the bytes [start, end) for the first occurrence of |substring|.
	// Returns a pointer to the first byte of the match, or NULL if there is none.
	// A match that ends exactly at |end| is found. An empty |substring| matches at
	// |start|, as strstr does. A |substring| longer than the buffer never matches.
	const char* FindSubstring(const char* start,
	                          const char* end,
	                          const string& substring) {
	  const size_t sublen = substring.size();
	  if (sublen == 0) {
	    return start;
	  }
	  // Reject reversed or too-short ranges up front, so no pointer is ever formed
	  // before |start|.
	  if ((end < start) || (static_cast<size_t>(end - start) < sublen)) {
	    return NULL;
	  }

	  const char* const sub = substring.data();
	  const char first_lower = lower(sub[0]);
	  const char first_upper = upper(sub[0]);
	  const char* const rest = sub + 1;
	  const size_t rest_len = sublen - 1;
	  // One past the last position at which a full-length match can begin. The
	  // position end - sublen itself is a valid start and must be scanned.
	  const char* const scan_end = end - sublen + 1;

	  while (start < scan_end) {
	    const size_t span = static_cast<size_t>(scan_end - start);
	    // Find the first occurrence of the lowercase version of the first
	    // letter of the substring.
	    const char* p = static_cast<const char*>(memchr(start, first_lower, span));
	    if (first_lower != first_upper) {
	      // If that's a letter, look for the uppercase version, too.
	      const char* q =
	          static_cast<const char*>(memchr(start, first_upper, span));
	      if (p == NULL || (q != NULL && q < p)) {
	        p = q;  // Uppercase came first (or lowercase was absent).
	      }
	    }
	    if (p == NULL) return NULL;
	    // p < scan_end, so p + 1 + rest_len <= end: the compare stays in bounds.
	    if (strncasecmp(p + 1, rest, rest_len) == 0) return p;
	    start = p + 1;
	  }
	  return NULL;
	}

	225

	226 size_t FindSubstring(const string& str, const string& substring) {

	227 const char* data = str.data();

	228 const char* z = FindSubstring(data, data + str.size(), substring);

	229 return z == NULL ? string::npos : z - data;

	230 }

	231

	232 // Get charset value from string

	233 // Normalize: truncate to 16 chars and make lowercase

	234 string GetCharset(const string& str) {

	235 auto charset_offset = FindSubstring(str, "charset");

	236 if (charset_offset == string::npos) {

	237 charset_offset = FindSubstring(str, "encoding");

	238 if (charset_offset == string::npos) {

	239 return "";

	240 }

	241 }

	242 int eq_offset = str.find("=", charset_offset);

	243 if (eq_offset == string::npos) {

	244 return "";

	245 }

	246 // skip same-line whitespace and quote after equal

	247 int wordstart_offset = str.find_first_not_of(" \t\"\'", eq_offset + 1);

	248 if (wordstart_offset == string::npos) {

	249 return "";

	250 }

	251 int len = str.length() - wordstart_offset;

	252 int wordend_offset = str.find_first_not_of(kWordLetters, wordstart_offset);

	253 if (wordend_offset != string::npos) {

	254 len = wordend_offset - wordstart_offset;

	255 }

	256

	257 // If too long, it must be bogus

	258 if (18 < len) {

	259 return "";

	260 }

	261 // If <= 1 char, it must be bogus

	262 if (len <= 1) {

	263 return "";

	264 }

	265 // If all digits and less than 3 or more than 6 digits, it must be bogus

	266 if (AllDigits(str, wordstart_offset, len) && ((len < 3) \|\| (6 < len))) {

	267 return "";

	268 }

	269

	270 // Extract and convert to lowercase (converting punct to '-')

	271 string charset(str.substr(wordstart_offset, len));

	272 StringToLowercase(&charset);

	273

	274 // Strip common prefixes - x- 3d

	275 while ((charset.size() > 2) &&

	276 (charset[0] == '3') &&

	277 (charset[1] == 'd')) {

	278 charset.erase(0, 2);

	279 }

	280 while ((charset.size() > 2) &&

	281 (charset[0] == 'x') &&

	282 (charset[1] == '-')) {

	283 charset.erase(0, 2);

	284 }

	285 while ((charset.size() > 1) &&

	286 (charset[0] == '-')) {

	287 charset.erase(0, 1);

	288 }

	289

	290

	291 // Strip common suffixes - -80 -19xx -200x

	292 while ((charset.size() > 1) &&

	293 (charset[charset.size() - 1] == '-')) {

	294 charset.erase(charset.size() - 1, 1);

	295 }

	296

	297 if ((charset.size() > 3) &&

	298 (charset[charset.size() - 3] == '-') &&

	299 (charset[charset.size() - 2] == '8') &&

	300 (charset[charset.size() - 1] == '0')) {

	301 charset.erase(charset.size() - 3, 3);

	302 }

	303 if ((charset.size() > 5) &&

	304 (charset[charset.size() - 5] == '-') &&

	305 (charset[charset.size() - 4] == '1') &&

	306 (charset[charset.size() - 3] == '9')) {

	307 charset.erase(charset.size() - 5, 5);

	308 }

	309 if ((charset.size() > 5) &&

	310 (charset[charset.size() - 5] == '-') &&

	311 (charset[charset.size() - 4] == '2') &&

	312 (charset[charset.size() - 3] == '0') &&

	313 (charset[charset.size() - 2] == '0')) {

	314 charset.erase(charset.size() - 5, 5);

	315 }

	316

	317 // Truncate

	318 if (charset.size() > 16) {

	319 charset.resize(16);

	320 }

	321

	322 return charset;

	323 }

	324

	325 int GetHttpHeaderLength(const char* document_text, uint32 document_length) {

	326 // HTTP headers end with cr lf cr lf

	327 const char* end = FindSubstring(document_text,

	328 document_text + document_length,

	329 "\r\n\r\n");

	330 return end

	331 ? end - document_text + 4 // skip over the cr lf cr lf

	332 : 0;

	333 }

	334 /*

	335 // Get top level domain from URL

	336 // Normalize: truncate to 16 chars and make lowercase

	337 string GetTLD(const char* url_str) {

	338 // some of urls are escaped, we need to unescape them. Otherwise

	339 // you will see the messy TLDs.

	340 if (url_str == NULL) {

	341 return string("");

	342 }

	343

	344 string unescaped_url;

	345 URL::UnescapeURL(url_str, strlen(url_str), &unescaped_url);

	346 URL url(unescaped_url);

	347 const char *hostname = url.host();

	348 const char *lastdot = strrchr(hostname, '.');

	349 if (lastdot == NULL) {

	350 // no dot in host; maybe it's not a fully qualified host name

	351 return "";

	352 }

	353

	354 const char *tld_str = lastdot + 1;

	355 // TLD can only have letters

	356 for (const char p = tld_str; p != '\0'; ++p) {

	357 if (!isalpha(*p)) {

	358 return "";

	359 }

	360 }

	361

	362 string tld(tld_str);

	363 // Truncate

	364 if (tld.size() > 16) {

	365 tld.resize(16);

	366 }

	367 StringToLowercase(&tld);

	368

	369 return tld;

	370 }

	371

	372 // Get charset from HTTP headers

	373 // Normalize: truncate to 16 chars and make lowercase

	374 string GetCharsetFromHttp(const char* http, int http_len) {

	375 if (FindSubstring(http, http + http_len, "charset")) {

	376 string headers(http, http_len);

	377 return GetCharset(headers);

	378 }

	379 return "";

	380 }

	381

	382 // Get charset= from <meta> tag

	383 // Or get encoding= from <?xml?> tag

	384 // <?xml version="1.0" encoding="ISO-8859-1" standalone="no"?>

	385 // Normalize: truncate to 16 chars and make lowercase

	386 string GetCharsetFromMeta(const char* body, int body_len) {

	387 const char* start = body;

	388 const char* const end = start + body_len;

	389 while (start < end) {

	390 const char* meta = FindSubstring(start, end, "<meta ");

	391 if (meta == NULL) {

	392 break;

	393 }

	394 const char* endtag = FindSubstring(meta, end, ">");

	395 if (endtag == NULL) {

	396 break;

	397 }

	398 if (endtag - meta > 1024) {

	399 endtag = meta + 1024;

	400 }

	401 const char* meta_end = endtag + 1;

	402 if (FindSubstring(meta, meta_end, "charset") != NULL) {

	403 return GetCharset(string(meta, meta_end - meta));

	404 }

	405 start = meta_end;

	406 }

	407

	408 start = body;

	409 while (start < end) {

	410 const char* meta = FindSubstring(start, end, "<?xml ");

	411 if (meta == NULL) {

	412 break;

	413 }

	414 const char* endtag = FindSubstring(meta, end, ">");

	415 if (endtag == NULL) {

	416 break;

	417 }

	418 if (endtag - meta > 1024) {

	419 endtag = meta + 1024;

	420 }

	421 const char* meta_end = endtag + 1;

	422 if (FindSubstring(meta, meta_end, "encoding") != NULL) {

	423 return GetCharset(string(meta, meta_end - meta));

	424 }

	425 start = meta_end;

	426 }

	427

	428 return "";

	429 }

	430 */

OLD	NEW