Index: third_party/cld/encodings/compact_enc_det/compact_enc_det.h |
diff --git a/third_party/cld/encodings/compact_enc_det/compact_enc_det.h b/third_party/cld/encodings/compact_enc_det/compact_enc_det.h |
new file mode 100644 |
index 0000000000000000000000000000000000000000..8dadb7dba4a00e2722cb407595c3915b4368fa74 |
--- /dev/null |
+++ b/third_party/cld/encodings/compact_enc_det/compact_enc_det.h |
@@ -0,0 +1,71 @@ |
+// |
+// Copyright 2006, 2007 Google Inc. All Rights Reserved. |
+// Author: dsites@google.com (Dick Sites) |
+// |
+// Design document: eng/designdocs/i18n/compact_encoding_detector.pdf |
+ |
+#ifndef ENCODINGS_COMPACT_ENC_DET_COMPACT_ENC_DET_H__ |
+#define ENCODINGS_COMPACT_ENC_DET_COMPACT_ENC_DET_H__ |
+ |
+#include "encodings/public/encodings.h" // for Encoding |
+#include "languages/public/languages.h" // for Language |
+ |
+namespace CompactEncDet { |
+ // We may want different statistics, depending on whether the text being |
+ // identified is from the web, from email, etc. This is currently ignored, |
+ // except WEB_CORPUS enables ignoring chars inside tags. |
+ enum TextCorpusType { |
+ WEB_CORPUS, |
+ XML_CORPUS, |
+ QUERY_CORPUS, // Use this for vanilla plaintext |
+ EMAIL_CORPUS, |
+ NUM_CORPA, // always last; equals the number of corpus types above |
+ }; |
+ |
+ // Scan raw bytes and detect most likely encoding |
+ // Design goals: |
+ // Skip over big initial stretches of seven-bit ASCII bytes very quickly |
+ // Thread safe |
+ // Works equally well on |
+ // 50-byte queries, |
+ // 5000-byte email and |
+ // 50000-byte web pages |
+ // Length 0 input returns ASCII (aka ISO-8859-1 or Latin1) |
+ // |
+ // Inputs: text and text_length |
+ // web page's url (preferred) or just |
+ // top-level domain name (e.g. "com") or NULL as a hint |
+ // web page's HTTP header charset= string (e.g. "Latin1") or NULL as a hint |
+ // web page's <meta> tag charset= string (e.g. "utf-8") or NULL as a hint |
+ // an Encoding (passed as an int) or UNKNOWN_ENCODING as a hint |
+ // a Language or UNKNOWN_LANGUAGE as a hint |
+ // corpus type from the list above. Currently ignored; may select |
+ // different probability tables in the future |
+ // ignore_7bit_mail_encodings if true says to NOT return the pure seven-bit |
+ // encodings ISO-2022-JP (aka JIS), ISO-2022-CN, ISO-2022-KR, HZ, and UTF-7. |
+ // This may save a little scoring time on pure printable ASCII input text |
+ // Outputs: bytes_consumed says how much of text_length was actually examined |
+ // is_reliable set true if the returned encoding is at least 2**10 times more |
+ // probable than the second-best encoding |
+ // Return value: the most likely encoding for the input text |
+ // |
+ // Setting ignore_7bit_mail_encodings effectively turns off detection of |
+ // UTF-7, HZ, and ISO-2022-xx. It is recommended that this flag be true |
+ // when corpus_type is QUERY_CORPUS. |
+ Encoding DetectEncoding( |
+ const char* text, int text_length, const char* url_hint, |
+ const char* http_charset_hint, const char* meta_charset_hint, |
+ const int encoding_hint, |
+ const Language language_hint, // User interface lang |
+ const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings, |
+ int* bytes_consumed, bool* is_reliable); |
+ |
+ // Support functions for the unit test program (declarations only here) |
+ int BackmapEncodingToRankedEncoding(Encoding enc); |
+ Encoding TopEncodingOfLangHint(const char* name); |
+ Encoding TopEncodingOfTLDHint(const char* name); |
+ Encoding TopEncodingOfCharsetHint(const char* name); |
+ const char* Version(void); |
+}; // End namespace CompactEncDet |
+ |
+#endif // ENCODINGS_COMPACT_ENC_DET_COMPACT_ENC_DET_H__ |