OLD | NEW |
(Empty) | |
| 1 // |
| 2 // Copyright 2006, 2007 Google Inc. All Rights Reserved. |
| 3 // Author: dsites@google.com (Dick Sites) |
| 4 // |
| 5 // Design document: eng/designdocs/i18n/compact_encoding_detector.pdf |
| 6 |
| 7 #ifndef ENCODINGS_COMPACT_ENC_DET_COMPACT_ENC_DET_H__ |
| 8 #define ENCODINGS_COMPACT_ENC_DET_COMPACT_ENC_DET_H__ |
| 9 |
| 10 #include "encodings/public/encodings.h" // for Encoding |
| 11 #include "languages/public/languages.h" // for Language |
| 12 |
| 13 namespace CompactEncDet { |
| 14 // We may want different statistics, depending on whether the text being |
| 15 // identified is from the web, from email, etc. This is currently ignored, |
| 16 // except WEB_CORPUS enables ignoring chars inside tags. |
| 17 enum TextCorpusType { |
| 18 WEB_CORPUS, |
| 19 XML_CORPUS, |
| 20 QUERY_CORPUS, // Use this for vanilla plaintext |
| 21 EMAIL_CORPUS, |
| 22 NUM_CORPA, // always last |
| 23 }; |
| 24 |
| 25 // Scan raw bytes and detect most likely encoding |
| 26 // Design goals: |
| 27 // Skip over big initial stretches of seven-bit ASCII bytes very quickly |
| 28 // Thread safe |
| 29 // Works equally well on |
| 30 // 50-byte queries, |
| 31 // 5000-byte email and |
| 32 // 50000-byte web pages |
| 33 // Length 0 input returns ASCII (aka ISO-8859-1 or Latin1) |
| 34 // |
| 35 // Inputs: text and text_length |
| 36 // web page's url (preferred) or just |
| 37 // top-level domain name (e.g. "com") or NULL as a hint |
| 38 // web page's HTTP header charset= string (e.g. "Latin1") or NULL as a hint |
| 39 // web page's <meta> tag charset= string (e.g. "utf-8") or NULL as a hint |
| 40 // an Encoding or UNKNOWN_ENCODING as a hint |
| 41 // a Language or UNKNOWN_LANGUAGE as a hint |
| 42 // corpus type from the list above. Currently ignored (except WEB_CORPUS); may select |
| 43 // different probability tables in the future |
| 44 // ignore_7bit_mail_encodings if true says to NOT return the pure seven-bit encodings |
| 45 // ISO-2022-JP (aka JIS), ISO-2022-CN, ISO-2022-KR, HZ, and UTF-7. |
| 46 // This may save a little scoring time on pure printable ASCII input text |
| 47 // Outputs: bytes_consumed says how much of text_length was actually examined |
| 48 // is_reliable set true if the returned encoding is at least 2**10 times more |
| 49 // probable than the second-best encoding |
| 50 // Return value: the most likely encoding for the input text |
| 51 // |
| 52 // Setting ignore_7bit_mail_encodings effectively turns off detection of |
| 53 // UTF-7, HZ, and ISO-2022-xx. It is recommended that this flag be true |
| 54 // when corpus_type is QUERY_CORPUS. |
| 55 Encoding DetectEncoding( |
| 56 const char* text, int text_length, const char* url_hint, |
| 57 const char* http_charset_hint, const char* meta_charset_hint, |
| 58 const int encoding_hint, |
| 59 const Language language_hint, // User interface lang |
| 60 const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings, |
| 61 int* bytes_consumed, bool* is_reliable); |
| 62 |
| 63 // Support functions for unit test program |
| 64 int BackmapEncodingToRankedEncoding(Encoding enc); |
| 65 Encoding TopEncodingOfLangHint(const char* name); |
| 66 Encoding TopEncodingOfTLDHint(const char* name); |
| 67 Encoding TopEncodingOfCharsetHint(const char* name); |
| 68 const char* Version(void); |
| 69 }; // End namespace CompactEncDet |
| 70 |
| 71 #endif // ENCODINGS_COMPACT_ENC_DET_COMPACT_ENC_DET_H__ |
OLD | NEW |