third_party/cld/encodings/compact_enc_det/compact_enc_det.h - Issue 1956183002: CL for perf tryjob on linux

Unified Diff: third_party/cld/encodings/compact_enc_det/compact_enc_det.h

Issue 1956183002: CL for perf tryjob on linux (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: third_party/cld/encodings/compact_enc_det/compact_enc_det.h

diff --git a/third_party/cld/encodings/compact_enc_det/compact_enc_det.h b/third_party/cld/encodings/compact_enc_det/compact_enc_det.h

new file mode 100644

index 0000000000000000000000000000000000000000..8dadb7dba4a00e2722cb407595c3915b4368fa74

--- /dev/null

+++ b/third_party/cld/encodings/compact_enc_det/compact_enc_det.h

@@ -0,0 +1,71 @@

+//

+// Author: dsites@google.com (Dick Sites)

+//

+// Design document: eng/designdocs/i18n/compact_encoding_detector.pdf

+#ifndef ENCODINGS_COMPACT_ENC_DET_COMPACT_ENC_DET_H__

+#define ENCODINGS_COMPACT_ENC_DET_COMPACT_ENC_DET_H__

+#include "encodings/public/encodings.h" // for Encoding

+#include "languages/public/languages.h" // for Language

+namespace CompactEncDet {

+ // Kind of corpus the text being identified comes from (the web, email, etc.).

+ // We may want different statistics per corpus type. This is currently ignored,

+ // except that WEB_CORPUS enables ignoring chars inside tags.

+ enum TextCorpusType {

+ WEB_CORPUS, // Enables ignoring chars inside tags

+ XML_CORPUS,

+ QUERY_CORPUS, // Use this for vanilla plaintext

+ EMAIL_CORPUS,

+ NUM_CORPA, // Equals the number of corpus types above; must always be last

+ };

+ // Scans raw bytes and detects the most likely encoding.

+ // Design goals:

+ // Skip over big initial stretches of seven-bit ASCII bytes very quickly

+ // Thread safe

+ // Works equally well on

+ // 50-byte queries,

+ // 5000-byte email and

+ // 50000-byte web pages

+ // Length 0 input returns ASCII (aka ISO-8859-1 or Latin1)

+ //

+ // Inputs: text and text_length: the raw bytes to examine and their count

+ // url_hint: web page's url (preferred) or just

+ // top-level domain name (e.g. "com") or NULL as a hint

+ // http_charset_hint: web page's HTTP header charset= string (e.g. "Latin1") or NULL

+ // meta_charset_hint: web page's <meta> tag charset= string (e.g. "utf-8") or NULL

+ // encoding_hint: an Encoding (declared as int here) or UNKNOWN_ENCODING as a hint

+ // language_hint: a Language or UNKNOWN_LANGUAGE as a hint

+ // corpus_type: a TextCorpusType. Currently ignored, except that WEB_CORPUS enables

+ // ignoring chars inside tags; may select different probability tables in the future

+ // ignore_7bit_mail_encodings: if true, do NOT return the pure seven-bit encodings

+ // ISO-2022-JP (aka JIS), ISO-2022-CN, ISO-2022-KR, HZ, and UTF-7.

+ // This may save a little scoring time on pure printable ASCII input text

+ // Outputs: bytes_consumed says how much of text_length was actually examined

+ // is_reliable set true if the returned encoding is at least 2**10 times more

+ // probable than the second-best encoding

+ // Return value: the most likely encoding for the input text

+ //

+ // Setting ignore_7bit_mail_encodings effectively turns off detection of

+ // UTF-7, HZ, and ISO-2022-xx. It is recommended that this flag be true

+ // when corpus_type is QUERY_CORPUS.

+ Encoding DetectEncoding(

+ const char* text, int text_length, const char* url_hint,

+ const char* http_charset_hint, const char* meta_charset_hint,

+ const int encoding_hint,

+ const Language language_hint, // User interface lang

+ const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings,

+ int* bytes_consumed, bool* is_reliable);

+ // Support functions for the unit test program. Only declarations are visible here; NOTE(review): confirm exact semantics against compact_enc_det.cc.

+ int BackmapEncodingToRankedEncoding(Encoding enc); // presumably maps enc to an internal ranked-encoding index -- TODO confirm

+ Encoding TopEncodingOfLangHint(const char* name); // presumably the top-scoring encoding for a language hint string -- TODO confirm

+ Encoding TopEncodingOfTLDHint(const char* name); // presumably the top-scoring encoding for a top-level-domain hint -- TODO confirm

+ Encoding TopEncodingOfCharsetHint(const char* name); // presumably the top-scoring encoding for a charset= hint string -- TODO confirm

+ const char* Version(void); // presumably a version string for the detector; ownership/format not shown here -- TODO confirm

+}; // End namespace CompactEncDet

+#endif // ENCODINGS_COMPACT_ENC_DET_COMPACT_ENC_DET_H__

« no previous file with comments | « third_party/cld/base/varsetter.h ('k') | third_party/cld/encodings/compact_enc_det/compact_enc_det.cc » ('j') | no next file with comments »