Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(21)

Side by Side Diff: third_party/cld/encodings/compact_enc_det/compact_enc_det.h

Issue 1956183002: CL for perf tryjob on linux (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 //
2 // Copyright 2006, 2007 Google Inc. All Rights Reserved.
3 // Author: dsites@google.com (Dick Sites)
4 //
5 // Design document: eng/designdocs/i18n/compact_encoding_detector.pdf
6
7 #ifndef ENCODINGS_COMPACT_ENC_DET_COMPACT_ENC_DET_H__
8 #define ENCODINGS_COMPACT_ENC_DET_COMPACT_ENC_DET_H__
9
10 #include "encodings/public/encodings.h" // for Encoding
11 #include "languages/public/languages.h" // for Language
12
13 namespace CompactEncDet {
14 // We may want different statistics, depending on whether the text being
15 // identified is from the web, from email, etc. This is currently ignored,
16 // except that WEB_CORPUS enables ignoring chars inside tags.
17 enum TextCorpusType {
18 WEB_CORPUS,
19 XML_CORPUS,
20 QUERY_CORPUS, // Use this for vanilla plaintext
21 EMAIL_CORPUS,
22 NUM_CORPA, // always last; equals the number of corpus types above
23 };
24
25 // Scan raw bytes and detect the most likely encoding
26 // Design goals:
27 // Skip over big initial stretches of seven-bit ASCII bytes very quickly
28 // Thread safe
29 // Works equally well on
30 // 50-byte queries,
31 // 5000-byte email and
32 // 50000-byte web pages
33 // Length 0 input returns ASCII (aka ISO-8859-1 or Latin1)
34 //
35 // Inputs: text and text_length
36 // web page's url (preferred) or just
37 // top-level domain name (e.g. "com") or NULL as a hint
38 // web page's HTTP header charset= string (e.g. "Latin1") or NULL as a hint
39 // web page's <meta> tag charset= string (e.g. "utf-8") or NULL as a hint
40 // an Encoding (passed as int) or UNKNOWN_ENCODING as a hint
41 // a Language or UNKNOWN_LANGUAGE as a hint
42 // corpus type from the list above. Currently ignored; may select
43 // different probability tables in the future
44 // ignore_7bit_mail_encodings if true says to NOT return the pure seven-bit
45 // encodings ISO-2022-JP (aka JIS), ISO-2022-CN, ISO-2022-KR, HZ, and UTF-7.
46 // This may save a little scoring time on pure printable ASCII input text
47 // Outputs: bytes_consumed says how much of text_length was actually examined
48 // is_reliable set true if the returned encoding is at least 2**10 times more
49 // probable than the second-best encoding
50 // Return value: the most likely encoding for the input text
51 //
52 // Setting ignore_7bit_mail_encodings effectively turns off detection of
53 // UTF-7, HZ, and ISO-2022-xx. It is recommended that this flag be true
54 // when corpus_type is QUERY_CORPUS.
55 Encoding DetectEncoding(
56 const char* text, int text_length, const char* url_hint,
57 const char* http_charset_hint, const char* meta_charset_hint,
58 const int encoding_hint,
59 const Language language_hint, // User interface lang
60 const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings,
61 int* bytes_consumed, bool* is_reliable);
62
63 // Support functions for the unit test program; implementations are not in this header
64 int BackmapEncodingToRankedEncoding(Encoding enc);
65 Encoding TopEncodingOfLangHint(const char* name); // NOTE(review): presumably name is a language name/code -- confirm
66 Encoding TopEncodingOfTLDHint(const char* name); // NOTE(review): presumably name is a top-level domain such as "com" -- confirm
67 Encoding TopEncodingOfCharsetHint(const char* name); // NOTE(review): presumably name is a charset= string -- confirm
68 const char* Version(void);
69 }; // End namespace CompactEncDet
70
71 #endif // ENCODINGS_COMPACT_ENC_DET_COMPACT_ENC_DET_H__
OLDNEW
« no previous file with comments | « third_party/cld/base/varsetter.h ('k') | third_party/cld/encodings/compact_enc_det/compact_enc_det.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698