icu46/source/i18n/csrmbcs.h - Issue 5516007: Check in the pristine copy of ICU 4.6...

Unified Diff: icu46/source/i18n/csrmbcs.h

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu46/source/i18n/csrmbcs.h

===================================================================

--- icu46/source/i18n/csrmbcs.h (revision 0)

+++ icu46/source/i18n/csrmbcs.h (revision 0)

@@ -0,0 +1,205 @@

+/*

+ **********************************************************************

+ */

+#ifndef __CSRMBCS_H

+#define __CSRMBCS_H

+#include "unicode/utypes.h"

+#if !UCONFIG_NO_CONVERSION

+#include "csrecog.h"

+U_NAMESPACE_BEGIN

+// "Character" iterated character class.

+// Recognizers for specific mbcs encodings make their "characters" available

+// by providing a nextChar() function that fills in an instance of IteratedChar

+// with the next char from the input.

+// The returned characters are not converted to Unicode, but remain as the raw

+// bytes (concatenated into an int) from the codepage data.

+//

+// For Asian charsets, use the raw input rather than the input that has been

+// stripped of markup. Detection only considers multi-byte chars, effectively

+// stripping markup anyway, and double byte chars do occur in markup too.

+//

+class IteratedChar : public UMemory

+{

+public:

+ uint32_t charValue; // 1-4 bytes from the raw input data

+ int32_t index;

+ int32_t nextIndex;

+ UBool error;

+ UBool done;

+public:

+ IteratedChar();

+ //void reset();

+ int32_t nextByte(InputText* det);

+};

+class CharsetRecog_mbcs : public CharsetRecognizer {

+protected:

+ /**

+ * Test the match of this charset with the input text data

+ * which is obtained via the CharsetDetector object.

+ *

+ * @param det The CharsetDetector, which contains the input text

+ * to be checked for being in this charset.

+ * @return Two values packed into one int (Damn java, anyhow)

+ * <br/>

+ * bits 0-7: the match confidence, ranging from 0-100

+ * <br/>

+ * bits 8-15: The match reason, an enum-like value.

+ */

+ int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen);

+public:

+ virtual ~CharsetRecog_mbcs();

+ /**

+ * Get the IANA name of this charset.

+ * @return the charset name.

+ */

+ const char *getName() const = 0;

+ const char *getLanguage() const = 0;

+ int32_t match(InputText* det) = 0;

+ /**

+ * Get the next character (however many bytes it is) from the input data

+ * Subclasses for specific charset encodings must implement this function

+ * to get characters according to the rules of their encoding scheme.

+ *

+ * This function is not a method of class IteratedChar only because

+ * that would require a lot of extra derived classes, which is awkward.

+ * @param it The IteratedChar "struct" into which the returned char is placed.

+ * @param textIn The input text, which is needed to get at the input byte data

+ * being iterated over.

+ * @return True if a character was returned, false at end of input.

+ */

+ virtual UBool nextChar(IteratedChar *it, InputText *textIn) = 0;

+};

+/**

+ * Shift-JIS charset recognizer.

+ *

+ */

+class CharsetRecog_sjis : public CharsetRecog_mbcs {

+public:

+ virtual ~CharsetRecog_sjis();

+ UBool nextChar(IteratedChar *it, InputText *det);

+ int32_t match(InputText *det);

+ const char *getName() const;

+ const char *getLanguage() const;

+};

+/**

+ * EUC charset recognizers. One abstract class that provides the common function

+ * for getting the next character according to the EUC encoding scheme,

+ * and nested derived classes for EUC_KR, EUC_JP, EUC_CN.

+ *

+ */

+class CharsetRecog_euc : public CharsetRecog_mbcs

+{

+public:

+ virtual ~CharsetRecog_euc();

+ const char *getName() const = 0;

+ const char *getLanguage() const = 0;

+ int32_t match(InputText* det) = 0;

+ /*

+ * (non-Javadoc)

+ * Get the next character value for EUC based encodings.

+ * Character "value" is simply the raw bytes that make up the character

+ * packed into an int.

+ */

+ UBool nextChar(IteratedChar *it, InputText *det);

+};

+/**

+ * The charset recognizer for EUC-JP. A singleton instance of this class

+ * is created and kept by the public CharsetDetector class

+ */

+class CharsetRecog_euc_jp : public CharsetRecog_euc

+{

+public:

+ virtual ~CharsetRecog_euc_jp();

+ const char *getName() const;

+ const char *getLanguage() const;

+ int32_t match(InputText *det);

+};

+/**

+ * The charset recognizer for EUC-KR. A singleton instance of this class

+ * is created and kept by the public CharsetDetector class

+ */

+class CharsetRecog_euc_kr : public CharsetRecog_euc

+{

+public:

+ virtual ~CharsetRecog_euc_kr();

+ const char *getName() const;

+ const char *getLanguage() const;

+ int32_t match(InputText *det);

+};

+/**

+ *

+ * Big5 charset recognizer.

+ *

+ */

+class CharsetRecog_big5 : public CharsetRecog_mbcs

+{

+public:

+ virtual ~CharsetRecog_big5();

+ UBool nextChar(IteratedChar* it, InputText* det);

+ const char *getName() const;

+ const char *getLanguage() const;

+ int32_t match(InputText *det);

+};

+/**

+ *

+ * GB-18030 recognizer. Uses simplified Chinese statistics.

+ *

+ */

+class CharsetRecog_gb_18030 : public CharsetRecog_mbcs

+{

+public:

+ virtual ~CharsetRecog_gb_18030();

+ UBool nextChar(IteratedChar* it, InputText* det);

+ const char *getName() const;

+ const char *getLanguage() const;

+ int32_t match(InputText *det);

+};

+U_NAMESPACE_END

+#endif

+#endif /* __CSRMBCS_H */

Property changes on: icu46/source/i18n/csrmbcs.h

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « icu46/source/i18n/csrecog.cpp ('k') | icu46/source/i18n/csrmbcs.cpp » ('j') | no next file with comments »