icu46/source/i18n/csrmbcs.h - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/i18n/csrmbcs.h

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 **********************************************************************

	3 * Copyright (C) 2005-2008, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 **********************************************************************

	6 */

	7

	8 #ifndef __CSRMBCS_H

	9 #define __CSRMBCS_H

	10

	11 #include "unicode/utypes.h"

	12

	13 #if !UCONFIG_NO_CONVERSION

	14

	15 #include "csrecog.h"

	16

	17 U_NAMESPACE_BEGIN

	18

	19 // "Character" iterated character class.

	20 // Recognizers for specific mbcs encodings make their "characters" available

	21 // by providing a nextChar() function that fills in an instance of IteratedCh ar

	22 // with the next char from the input.

	23 // The returned characters are not converted to Unicode, but remain as the ra w

	24 // bytes (concatenated into an int) from the codepage data.

	25 //

	26 // For Asian charsets, use the raw input rather than the input that has been

	27 // stripped of markup. Detection only considers multi-byte chars, effectively

	28 // stripping markup anyway, and double byte chars do occur in markup too.

	29 //

	30 class IteratedChar : public UMemory

	31 {

	32 public:

	33 uint32_t charValue; // 1-4 bytes from the raw input data

	34 int32_t index;

	35 int32_t nextIndex;

	36 UBool error;

	37 UBool done;

	38

	39 public:

	40 IteratedChar();

	41 //void reset();

	42 int32_t nextByte(InputText* det);

	43 };

	44

	45

	46 class CharsetRecog_mbcs : public CharsetRecognizer {

	47

	48 protected:

	49 /**

	50 * Test the match of this charset with the input text data

	51 * which is obtained via the CharsetDetector object.

	52 *

	53 * @param det The CharsetDetector, which contains the input text

	54 * to be checked for being in this charset.

	55 * @return Two values packed into one int (Damn java, anyhow)

	56 * <br/>

	57 * bits 0-7: the match confidence, ranging from 0-100

	58 * <br/>

	59 * bits 8-15: The match reason, an enum-like value.

	60 */

	61 int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t com monCharsLen);

	62

	63 public:

	64

	65 virtual ~CharsetRecog_mbcs();

	66

	67 /**

	68 * Get the IANA name of this charset.

	69 * @return the charset name.

	70 */

	71

	72 const char *getName() const = 0;

	73 const char *getLanguage() const = 0;

	74 int32_t match(InputText* det) = 0;

	75

	76 /**

	77 * Get the next character (however many bytes it is) from the input data

	78 * Subclasses for specific charset encodings must implement this function

	79 * to get characters according to the rules of their encoding scheme.

	80 *

	81 * This function is not a method of class IteratedChar only because

	82 * that would require a lot of extra derived classes, which is awkward.

	83 * @param it The IteratedChar "struct" into which the returned char is plac ed.

	84 * @param det The charset detector, which is needed to get at the input byte data

	85 * being iterated over.

	86 * @return True if a character was returned, false at end of input.

	87 */

	88 virtual UBool nextChar(IteratedChar it, InputText textIn) = 0;

	89

	90 };

	91

	92

	93 /**

	94 * Shift-JIS charset recognizer.

	95 *

	96 */

	97 class CharsetRecog_sjis : public CharsetRecog_mbcs {

	98 public:

	99 virtual ~CharsetRecog_sjis();

	100

	101 UBool nextChar(IteratedChar it, InputText det);

	102

	103 int32_t match(InputText *det);

	104

	105 const char *getName() const;

	106 const char *getLanguage() const;

	107

	108 };

	109

	110

	111 /**

	112 * EUC charset recognizers. One abstract class that provides the common funct ion

	113 * for getting the next character according to the EUC encoding sche me,

	114 * and nested derived classes for EUC_KR, EUC_JP, EUC_CN.

	115 *

	116 */

	117 class CharsetRecog_euc : public CharsetRecog_mbcs

	118 {

	119 public:

	120 virtual ~CharsetRecog_euc();

	121

	122 const char *getName() const = 0;

	123 const char *getLanguage() const = 0;

	124

	125 int32_t match(InputText* det) = 0;

	126 /*

	127 * (non-Javadoc)

	128 * Get the next character value for EUC based encodings.

	129 * Character "value" is simply the raw bytes that make up the character

	130 * packed into an int.

	131 */

	132 UBool nextChar(IteratedChar it, InputText det);

	133 };

	134

	135 /**

	136 * The charset recognize for EUC-JP. A singleton instance of this class

	137 * is created and kept by the public CharsetDetector class

	138 */

	139 class CharsetRecog_euc_jp : public CharsetRecog_euc

	140 {

	141 public:

	142 virtual ~CharsetRecog_euc_jp();

	143

	144 const char *getName() const;

	145 const char *getLanguage() const;

	146

	147 int32_t match(InputText *det);

	148 };

	149

	150 /**

	151 * The charset recognize for EUC-KR. A singleton instance of this class

	152 * is created and kept by the public CharsetDetector class

	153 */

	154 class CharsetRecog_euc_kr : public CharsetRecog_euc

	155 {

	156 public:

	157 virtual ~CharsetRecog_euc_kr();

	158

	159 const char *getName() const;

	160 const char *getLanguage() const;

	161

	162 int32_t match(InputText *det);

	163 };

	164

	165 /**

	166 *

	167 * Big5 charset recognizer.

	168 *

	169 */

	170 class CharsetRecog_big5 : public CharsetRecog_mbcs

	171 {

	172 public:

	173 virtual ~CharsetRecog_big5();

	174

	175 UBool nextChar(IteratedChar* it, InputText* det);

	176

	177 const char *getName() const;

	178 const char *getLanguage() const;

	179

	180 int32_t match(InputText *det);

	181 };

	182

	183

	184 /**

	185 *

	186 * GB-18030 recognizer. Uses simplified Chinese statistics.

	187 *

	188 */

	189 class CharsetRecog_gb_18030 : public CharsetRecog_mbcs

	190 {

	191 public:

	192 virtual ~CharsetRecog_gb_18030();

	193

	194 UBool nextChar(IteratedChar* it, InputText* det);

	195

	196 const char *getName() const;

	197 const char *getLanguage() const;

	198

	199 int32_t match(InputText *det);

	200 };

	201

	202 U_NAMESPACE_END

	203

	204 #endif

	205 #endif /* __CSRMBCS_H */

OLD	NEW

« no previous file with comments | « icu46/source/i18n/csrecog.cpp ('k') | icu46/source/i18n/csrmbcs.cpp » ('j') | no next file with comments »