OLD | NEW |
(Empty) | |
| 1 // |
| 2 // Copyright 2006 Google Inc. All Rights Reserved. |
| 3 // Author: dsites@google.com (Dick Sites) |
| 4 // |
| 5 |
| 6 |
| 7 #include "encodings/compact_enc_det/compact_enc_det_hint_code.h" |
| 8 |
| 9 #include <ctype.h> // for isalpha |
| 10 #include <string.h> // for NULL, memchr, strlen, etc |
| 11 |
| 12 #include "base/basictypes.h" // for uint8, uint32 |
| 13 //#include "webutil/url/url.h" // for URL |
| 14 |
// Byte-to-byte normalization table for charset names:
//   'A'-'Z' map to 'a'-'z', 'a'-'z' and '0'-'9' map to themselves,
//   every other byte value (including 0x80-0xff) maps to minus '-' (0x2d).
// Index with an unsigned byte value (0..255).
static const char kCharsetToLowerTbl[256] = {
  // 0x00 - 0x3f: controls and punctuation -> '-', digits kept
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, 0x38,0x39,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,

  // 0x40 - 0x7f: uppercase folded to lowercase, lowercase kept
  0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
  0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
  0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,

  // 0x80 - 0xbf: all '-'
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,

  // 0xc0 - 0xff: all '-'
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
  0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
};
| 37 |
| 38 |
// Membership table: 1 for ASCII letters 'A'-'Z' and 'a'-'z', 0 for every
// other byte value. Index with an unsigned byte value (0..255).
// Each row below covers 32 consecutive byte values.
static const char kIsAlpha[256] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x00
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x20
  0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,   // 0x40
  0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,   // 0x60

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x80
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xa0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xc0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xe0
};
| 50 |
// Membership table: 1 for ASCII digits '0'-'9', 0 for every other byte
// value. Index with an unsigned byte value (0..255).
// Each row below covers 32 consecutive byte values.
static const char kIsDigit[256] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x00
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1, 1,1,0,0,0,0,0,0,   // 0x20
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x40
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x60

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x80
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xa0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xc0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xe0
};
| 62 |
// Placeholder names for encoding values 100..119 (index = value - 100).
static const char* const kFakeEncodingName[] = {
  "FakeEnc100", "FakeEnc101", "FakeEnc102", "FakeEnc103", "FakeEnc104",
  "FakeEnc105", "FakeEnc106", "FakeEnc107", "FakeEnc108", "FakeEnc109",
  "FakeEnc110", "FakeEnc111", "FakeEnc112", "FakeEnc113", "FakeEnc114",
  "FakeEnc115", "FakeEnc116", "FakeEnc117", "FakeEnc118", "FakeEnc119",
};
// Placeholder names for encoding values just past NUM_ENCODINGS
// (index = value - NUM_ENCODINGS).
static const char* const kFakeEncodingName2[] = {
  "FakeEnc_0", "FakeEnc_1", "FakeEnc_2", "FakeEnc_3", "FakeEnc_4",
};
| 72 |
| 73 // Return name for extended encoding |
| 74 const char* MyEncodingName(Encoding enc) { |
| 75 if (enc < 0) { |
| 76 return "~"; |
| 77 } |
| 78 if (enc == ISO_8859_1) { |
| 79 return "Latin1"; // I can't stand "ASCII" for this |
| 80 } |
| 81 if (enc < NUM_ENCODINGS) { |
| 82 return EncodingName(enc); |
| 83 } |
| 84 // allow fake names, for exploration |
| 85 if ((NUM_ENCODINGS <= enc) && (enc < (NUM_ENCODINGS + 4))) { |
| 86 return kFakeEncodingName2[enc - NUM_ENCODINGS]; |
| 87 } |
| 88 if ((100 <= enc) && (enc < 120)) { |
| 89 return kFakeEncodingName[enc - 100]; |
| 90 } |
| 91 return "~"; |
| 92 } |
| 93 |
| 94 |
// http://www.iana.org/assignments/character-sets says a charset name is up
// to 40 bytes of any printable ASCII, but that can't be right when parsing
// HTML; at least the quote character is not allowed. The set here is
// letters, digits, and all punctuation found in registered names as of
// April 2006. A charset value ends at the first byte not in this set.
static const char kWordLetters[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                                   "abcdefghijklmnopqrstuvwxyz"
                                   "0123456789"
                                   "-_.:()";
| 103 |
| 104 |
| 105 // Normalize ASCII string to first 4 alphabetic chars and last 4 digit chars |
| 106 // Letters are forced to lowercase ASCII |
| 107 // Used to normalize charset= values |
| 108 string MakeChar44(const string& str) { |
| 109 string res("________"); // eight underscores |
| 110 int l_ptr = 0; |
| 111 int d_ptr = 0; |
| 112 for (int i = 0; i < str.size(); ++i) { |
| 113 uint8 uc = static_cast<uint8>(str[i]); |
| 114 if (kIsAlpha[uc]) { |
| 115 if (l_ptr < 4) { // Else ignore |
| 116 res[l_ptr] = kCharsetToLowerTbl[uc]; |
| 117 l_ptr++; |
| 118 } |
| 119 } else if (kIsDigit[uc]) { |
| 120 if (d_ptr < 4) { |
| 121 res[4 + d_ptr] = kCharsetToLowerTbl[uc]; |
| 122 } else { |
| 123 // Keep last 4 digits by shifting left |
| 124 res[4] = res[5]; |
| 125 res[5] = res[6]; |
| 126 res[6] = res[7]; |
| 127 res[7] = kCharsetToLowerTbl[uc]; |
| 128 } |
| 129 d_ptr++; |
| 130 } // If neither letter nor digit, drop entirely |
| 131 } |
| 132 return res; |
| 133 } |
| 134 |
| 135 // Normalize ASCII string to first 8 alphabetic/digit chars |
| 136 // Letters are forced to lowercase ASCII |
| 137 // Used to normalize TLD values |
| 138 string MakeChar4(const string& str) { |
| 139 string res("____"); // four underscores |
| 140 int l_ptr = 0; |
| 141 for (int i = 0; i < str.size(); ++i) { |
| 142 uint8 uc = static_cast<uint8>(str[i]); |
| 143 if (kIsAlpha[uc] | kIsDigit[uc]) { |
| 144 if (l_ptr < 4) { // Else ignore |
| 145 res[l_ptr] = kCharsetToLowerTbl[uc]; |
| 146 l_ptr++; |
| 147 } |
| 148 } |
| 149 } |
| 150 return res; |
| 151 } |
| 152 |
| 153 // Normalize ASCII string to first 8 alphabetic/digit chars |
| 154 // Letters are forced to lowercase ASCII |
| 155 // Used to normalize TLD values |
| 156 string MakeChar8(const string& str) { |
| 157 string res("________"); // eight dots |
| 158 int l_ptr = 0; |
| 159 for (int i = 0; i < str.size(); ++i) { |
| 160 uint8 uc = static_cast<uint8>(str[i]); |
| 161 if (kIsAlpha[uc] | kIsDigit[uc]) { |
| 162 if (l_ptr < 8) { // Else ignore |
| 163 res[l_ptr] = kCharsetToLowerTbl[uc]; |
| 164 l_ptr++; |
| 165 } |
| 166 } |
| 167 } |
| 168 return res; |
| 169 } |
| 170 |
| 171 // A-Z to a-z and all non-digits-letters to minus '-' |
| 172 void StringToLowercase(string* str) { |
| 173 for (int i = 0; i < str->size(); i++) { |
| 174 (*str)[i] = kCharsetToLowerTbl[static_cast<uint8>((*str)[i])]; |
| 175 } |
| 176 } |
| 177 |
// Return true if the len bytes of str starting at wordstart_offset are all
// ASCII digits '0'-'9'.
// An empty (or negative-length) range is vacuously all digits and returns
// true. A range that does not lie entirely inside str returns false instead
// of reading out of bounds.
bool AllDigits(const string& str, int wordstart_offset, int len) {
  if (len <= 0) {
    return true;
  }
  if (wordstart_offset < 0) {
    return false;
  }
  const size_t begin = static_cast<size_t>(wordstart_offset);
  const size_t count = static_cast<size_t>(len);
  // Written as a subtraction so begin + count cannot overflow.
  if ((begin > str.size()) || (count > str.size() - begin)) {
    return false;
  }
  for (size_t i = 0; i < count; ++i) {
    const char c = str[begin + i];
    if ((c < '0') || ('9' < c)) {
      return false;
    }
  }
  return true;
}
| 186 |
| 187 |
// ASCII-only: map 'A'-'Z' to 'a'-'z'; return any other char unchanged.
inline char lower(char c) {
  return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
}
// ASCII-only: map 'a'-'z' to 'A'-'Z'; return any other char unchanged.
inline char upper(char c) {
  return (c >= 'a' && c <= 'z') ? static_cast<char>(c - 'a' + 'A') : c;
}
| 190 |
// ASCII-only case folding used by FindSubstring: 'A'-'Z' -> 'a'-'z'.
static inline char FindSubstringFold(char c) {
  return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
}

// This is like strstr, but without assuming that the char* is null-terminated,
// and the comparison is case-insensitive (ASCII letters only). (Real files
// have "Meta", "meta", and "META". Some files have more than one version.
// Likewise for "charset".)
//
// Searches the bytes [start, end) for the first occurrence of substring.
// Returns a pointer to the first byte of the match, or NULL if there is none.
// A match may end exactly at end; bytes at or beyond end are never read.
// Returns NULL for an empty substring, for a substring longer than the
// buffer, and for a NULL or inverted range.
const char* FindSubstring(const char* start,
                          const char* end,
                          const string& substring) {
  if ((start == NULL) || (end == NULL) || (end < start)) {
    return NULL;
  }
  const size_t sublen = substring.size();
  const size_t buflen = static_cast<size_t>(end - start);
  if ((sublen == 0) || (sublen > buflen)) {
    return NULL;
  }

  const char* const sub = substring.data();
  const char first = FindSubstringFold(sub[0]);
  // One past the last position at which a complete match can still begin.
  // Using this (rather than end - sublen) is what lets a match that ends
  // exactly at end be found.
  const char* const last_start = end - sublen + 1;

  for (const char* p = start; p < last_start; ++p) {
    if (FindSubstringFold(*p) != first) {
      continue;
    }
    // First byte matches; compare the remainder, all within [p, p + sublen).
    size_t i = 1;
    while ((i < sublen) &&
           (FindSubstringFold(p[i]) == FindSubstringFold(sub[i]))) {
      ++i;
    }
    if (i == sublen) {
      return p;
    }
  }
  return NULL;
}
| 225 |
| 226 size_t FindSubstring(const string& str, const string& substring) { |
| 227 const char* data = str.data(); |
| 228 const char* z = FindSubstring(data, data + str.size(), substring); |
| 229 return z == NULL ? string::npos : z - data; |
| 230 } |
| 231 |
| 232 // Get charset value from string |
| 233 // Normalize: truncate to 16 chars and make lowercase |
| 234 string GetCharset(const string& str) { |
| 235 auto charset_offset = FindSubstring(str, "charset"); |
| 236 if (charset_offset == string::npos) { |
| 237 charset_offset = FindSubstring(str, "encoding"); |
| 238 if (charset_offset == string::npos) { |
| 239 return ""; |
| 240 } |
| 241 } |
| 242 int eq_offset = str.find("=", charset_offset); |
| 243 if (eq_offset == string::npos) { |
| 244 return ""; |
| 245 } |
| 246 // skip same-line whitespace and quote after equal |
| 247 int wordstart_offset = str.find_first_not_of(" \t\"\'", eq_offset + 1); |
| 248 if (wordstart_offset == string::npos) { |
| 249 return ""; |
| 250 } |
| 251 int len = str.length() - wordstart_offset; |
| 252 int wordend_offset = str.find_first_not_of(kWordLetters, wordstart_offset); |
| 253 if (wordend_offset != string::npos) { |
| 254 len = wordend_offset - wordstart_offset; |
| 255 } |
| 256 |
| 257 // If too long, it must be bogus |
| 258 if (18 < len) { |
| 259 return ""; |
| 260 } |
| 261 // If <= 1 char, it must be bogus |
| 262 if (len <= 1) { |
| 263 return ""; |
| 264 } |
| 265 // If all digits and less than 3 or more than 6 digits, it must be bogus |
| 266 if (AllDigits(str, wordstart_offset, len) && ((len < 3) || (6 < len))) { |
| 267 return ""; |
| 268 } |
| 269 |
| 270 // Extract and convert to lowercase (converting punct to '-') |
| 271 string charset(str.substr(wordstart_offset, len)); |
| 272 StringToLowercase(&charset); |
| 273 |
| 274 // Strip common prefixes - x- 3d |
| 275 while ((charset.size() > 2) && |
| 276 (charset[0] == '3') && |
| 277 (charset[1] == 'd')) { |
| 278 charset.erase(0, 2); |
| 279 } |
| 280 while ((charset.size() > 2) && |
| 281 (charset[0] == 'x') && |
| 282 (charset[1] == '-')) { |
| 283 charset.erase(0, 2); |
| 284 } |
| 285 while ((charset.size() > 1) && |
| 286 (charset[0] == '-')) { |
| 287 charset.erase(0, 1); |
| 288 } |
| 289 |
| 290 |
| 291 // Strip common suffixes - -80 -19xx -200x |
| 292 while ((charset.size() > 1) && |
| 293 (charset[charset.size() - 1] == '-')) { |
| 294 charset.erase(charset.size() - 1, 1); |
| 295 } |
| 296 |
| 297 if ((charset.size() > 3) && |
| 298 (charset[charset.size() - 3] == '-') && |
| 299 (charset[charset.size() - 2] == '8') && |
| 300 (charset[charset.size() - 1] == '0')) { |
| 301 charset.erase(charset.size() - 3, 3); |
| 302 } |
| 303 if ((charset.size() > 5) && |
| 304 (charset[charset.size() - 5] == '-') && |
| 305 (charset[charset.size() - 4] == '1') && |
| 306 (charset[charset.size() - 3] == '9')) { |
| 307 charset.erase(charset.size() - 5, 5); |
| 308 } |
| 309 if ((charset.size() > 5) && |
| 310 (charset[charset.size() - 5] == '-') && |
| 311 (charset[charset.size() - 4] == '2') && |
| 312 (charset[charset.size() - 3] == '0') && |
| 313 (charset[charset.size() - 2] == '0')) { |
| 314 charset.erase(charset.size() - 5, 5); |
| 315 } |
| 316 |
| 317 // Truncate |
| 318 if (charset.size() > 16) { |
| 319 charset.resize(16); |
| 320 } |
| 321 |
| 322 return charset; |
| 323 } |
| 324 |
| 325 int GetHttpHeaderLength(const char* document_text, uint32 document_length) { |
| 326 // HTTP headers end with cr lf cr lf |
| 327 const char* end = FindSubstring(document_text, |
| 328 document_text + document_length, |
| 329 "\r\n\r\n"); |
| 330 return end |
| 331 ? end - document_text + 4 // skip over the cr lf cr lf |
| 332 : 0; |
| 333 } |
| 334 /* |
| 335 // Get top level domain from URL |
| 336 // Normalize: truncate to 16 chars and make lowercase |
| 337 string GetTLD(const char* url_str) { |
| 338 // some of urls are escaped, we need to unescape them. Otherwise |
| 339 // you will see the messy TLDs. |
| 340 if (url_str == NULL) { |
| 341 return string(""); |
| 342 } |
| 343 |
| 344 string unescaped_url; |
| 345 URL::UnescapeURL(url_str, strlen(url_str), &unescaped_url); |
| 346 URL url(unescaped_url); |
| 347 const char *hostname = url.host(); |
| 348 const char *lastdot = strrchr(hostname, '.'); |
| 349 if (lastdot == NULL) { |
| 350 // no dot in host; maybe it's not a fully qualified host name |
| 351 return ""; |
| 352 } |
| 353 |
| 354 const char *tld_str = lastdot + 1; |
| 355 // TLD can only have letters |
| 356 for (const char *p = tld_str; *p != '\0'; ++p) { |
| 357 if (!isalpha(*p)) { |
| 358 return ""; |
| 359 } |
| 360 } |
| 361 |
| 362 string tld(tld_str); |
| 363 // Truncate |
| 364 if (tld.size() > 16) { |
| 365 tld.resize(16); |
| 366 } |
| 367 StringToLowercase(&tld); |
| 368 |
| 369 return tld; |
| 370 } |
| 371 |
| 372 // Get charset from HTTP headers |
| 373 // Normalize: truncate to 16 chars and make lowercase |
| 374 string GetCharsetFromHttp(const char* http, int http_len) { |
| 375 if (FindSubstring(http, http + http_len, "charset")) { |
| 376 string headers(http, http_len); |
| 377 return GetCharset(headers); |
| 378 } |
| 379 return ""; |
| 380 } |
| 381 |
| 382 // Get charset= from <meta> tag |
| 383 // Or get encoding= from <?xml?> tag |
| 384 // <?xml version="1.0" encoding="ISO-8859-1" standalone="no"?> |
| 385 // Normalize: truncate to 16 chars and make lowercase |
| 386 string GetCharsetFromMeta(const char* body, int body_len) { |
| 387 const char* start = body; |
| 388 const char* const end = start + body_len; |
| 389 while (start < end) { |
| 390 const char* meta = FindSubstring(start, end, "<meta "); |
| 391 if (meta == NULL) { |
| 392 break; |
| 393 } |
| 394 const char* endtag = FindSubstring(meta, end, ">"); |
| 395 if (endtag == NULL) { |
| 396 break; |
| 397 } |
| 398 if (endtag - meta > 1024) { |
| 399 endtag = meta + 1024; |
| 400 } |
| 401 const char* meta_end = endtag + 1; |
| 402 if (FindSubstring(meta, meta_end, "charset") != NULL) { |
| 403 return GetCharset(string(meta, meta_end - meta)); |
| 404 } |
| 405 start = meta_end; |
| 406 } |
| 407 |
| 408 start = body; |
| 409 while (start < end) { |
| 410 const char* meta = FindSubstring(start, end, "<?xml "); |
| 411 if (meta == NULL) { |
| 412 break; |
| 413 } |
| 414 const char* endtag = FindSubstring(meta, end, ">"); |
| 415 if (endtag == NULL) { |
| 416 break; |
| 417 } |
| 418 if (endtag - meta > 1024) { |
| 419 endtag = meta + 1024; |
| 420 } |
| 421 const char* meta_end = endtag + 1; |
| 422 if (FindSubstring(meta, meta_end, "encoding") != NULL) { |
| 423 return GetCharset(string(meta, meta_end - meta)); |
| 424 } |
| 425 start = meta_end; |
| 426 } |
| 427 |
| 428 return ""; |
| 429 } |
| 430 */ |
OLD | NEW |