OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * Copyright (C) 2013-2014, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* |
| 6 * collationruleparser.h |
| 7 * |
| 8 * created on: 2013apr10 |
| 9 * created by: Markus W. Scherer |
| 10 */ |
| 11 |
| 12 #ifndef __COLLATIONRULEPARSER_H__ |
| 13 #define __COLLATIONRULEPARSER_H__ |
| 14 |
| 15 #include "unicode/utypes.h" |
| 16 |
| 17 #if !UCONFIG_NO_COLLATION |
| 18 |
| 19 #include "unicode/ucol.h" |
| 20 #include "unicode/uniset.h" |
| 21 #include "unicode/unistr.h" |
| 22 |
| 23 struct UParseError; /* forward declaration placed before U_NAMESPACE_BEGIN; this header only uses it as UParseError * (parse() argument and the parseError member) */ |
| 24 |
| 25 U_NAMESPACE_BEGIN |
| 26 |
| 27 struct CollationData; /* used below only through const CollationData * (constructor argument, baseData member) */ |
| 28 struct CollationTailoring; /* NOTE(review): not referenced anywhere else in this header as shown - confirm it is still needed */ |
| 29 |
| 30 class Locale; /* NOTE(review): not referenced anywhere else in this header as shown - confirm it is still needed */ |
| 31 class Normalizer2; /* used below as the type of the nfd and nfc reference members */ |
| 32 |
| 33 struct CollationSettings; /* used below as CollationSettings & (parse() argument) and CollationSettings * (settings member) */ |
| 34 |
| 35 /** Parser for collation rule strings. Needs a caller-supplied Sink before parsing; an Importer is optional and only required for [import locale] syntax. */ class U_I18N_API CollationRuleParser : public UMemory { |
| 36 public: |
| 37 /** Special reset positions. The enumerators have no explicit values, so they are numbered 0 (FIRST_TERTIARY_IGNORABLE) through 13 (LAST_TRAILING). */ |
| 38 enum Position { |
| 39 FIRST_TERTIARY_IGNORABLE, |
| 40 LAST_TERTIARY_IGNORABLE, |
| 41 FIRST_SECONDARY_IGNORABLE, |
| 42 LAST_SECONDARY_IGNORABLE, |
| 43 FIRST_PRIMARY_IGNORABLE, |
| 44 LAST_PRIMARY_IGNORABLE, |
| 45 FIRST_VARIABLE, |
| 46 LAST_VARIABLE, |
| 47 FIRST_REGULAR, |
| 48 LAST_REGULAR, |
| 49 FIRST_IMPLICIT, |
| 50 LAST_IMPLICIT, |
| 51 FIRST_TRAILING, |
| 52 LAST_TRAILING |
| 53 }; |
| 54 |
| 55 /** |
| 56 * First character of contractions that encode special reset positions. |
| 57 * U+FFFE cannot be tailored via rule syntax. |
| 58 * |
| 59 * The second contraction character is POS_BASE + Position, |
| 59 * i.e. U+2800..U+280D for the 14 Position values (POS_BASE is 0x2800). |
| 60 */ |
| 61 static const UChar POS_LEAD = 0xfffe; |
| 62 /** |
| 63 * Base for the second character of contractions that encode special reset positions. |
| 64 * Braille characters U+28xx are printable and normalization-inert. |
| 65 * @see POS_LEAD |
| 66 */ |
| 67 static const UChar POS_BASE = 0x2800; |
| 68 /** Interface declaring addReset(), addRelation(), suppressContractions() and optimize(). The first two are pure virtual; the last two are not, so their default definitions live outside this header. */ |
| 69 class U_I18N_API Sink : public UObject { |
| 70 public: |
| 71 virtual ~Sink(); |
| 72 /** |
| 73 * Adds a reset. |
| 74 * strength=UCOL_IDENTICAL for &str. |
| 75 * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3. |
| 75 * errorReason is a reference to a const char pointer, so an implementation can |
| 75 * point it at a message; errorCode is passed by non-const reference as well. |
| 76 */ |
| 77 virtual void addReset(int32_t strength, const UnicodeString &str, |
| 78 const char *&errorReason, UErrorCode &errorCode) =
0; |
| 79 /** |
| 80 * Adds a relation with strength and prefix | str / extension. |
| 80 * prefix, str and extension are read-only inputs; errorReason and errorCode |
| 80 * are passed by non-const reference, as in addReset(). |
| 81 */ |
| 82 virtual void addRelation(int32_t strength, const UnicodeString &prefix, |
| 83 const UnicodeString &str, const UnicodeString &
extension, |
| 84 const char *&errorReason, UErrorCode &errorCode
) = 0; |
| 85 /** Takes a read-only UnicodeSet plus the errorReason/errorCode outputs. Not pure virtual, so subclasses need not override it. NOTE(review): presumably called for a suppress-contractions rule option - confirm in the implementation. */ |
| 86 virtual void suppressContractions(const UnicodeSet &set, const char *&er
rorReason, |
| 87 UErrorCode &errorCode); |
| 88 /** Same signature shape as suppressContractions(); also not pure virtual. NOTE(review): presumably called for an optimize rule option - confirm in the implementation. */ |
| 89 virtual void optimize(const UnicodeSet &set, const char *&errorReason, |
| 90 UErrorCode &errorCode); |
| 91 }; |
| 92 /** Interface with a single pure virtual method, getRules(). localeID and collationType are read-only C strings; rules, errorReason and errorCode are passed by non-const reference. NOTE(review): presumably rules receives the imported rule string - confirm in an implementing class. */ |
| 93 class U_I18N_API Importer : public UObject { |
| 94 public: |
| 95 virtual ~Importer(); |
| 96 virtual void getRules( |
| 97 const char *localeID, const char *collationType, |
| 98 UnicodeString &rules, |
| 99 const char *&errorReason, UErrorCode &errorCode) = 0; |
| 100 }; |
| 101 |
| 102 /** |
| 103 * Constructor. |
| 104 * The Sink must be set before parsing. |
| 105 * The Importer can be set, otherwise [import locale] syntax is not supported. |
| 105 * base is stored as a pointer (see the baseData member); the header does not |
| 105 * show it being copied or owned. |
| 106 */ |
| 107 CollationRuleParser(const CollationData *base, UErrorCode &errorCode); |
| 108 ~CollationRuleParser(); |
| 109 |
| 110 /** |
| 111 * Sets the pointer to a Sink object. |
| 112 * The pointer is aliased: Pointer copy without cloning or taking ownership. |
| 113 */ |
| 114 void setSink(Sink *sinkAlias) { |
| 115 sink = sinkAlias; |
| 116 } |
| 117 |
| 118 /** |
| 119 * Sets the pointer to an Importer object. |
| 120 * The pointer is aliased: Pointer copy without cloning or taking ownership. |
| 121 */ |
| 122 void setImporter(Importer *importerAlias) { |
| 123 importer = importerAlias; |
| 124 } |
| 125 /** Parses ruleString. outSettings is taken by non-const reference and outParseError by pointer, so both can be written to; errorCode is the usual in/out status. NOTE(review): whether outParseError may be NULL is not visible in this header - confirm in the implementation. */ |
| 126 void parse(const UnicodeString &ruleString, |
| 127 CollationSettings &outSettings, |
| 128 UParseError *outParseError, |
| 129 UErrorCode &errorCode); |
| 130 /** Returns the errorReason member as-is (pointer copy). The header does not show when that member is set or cleared. */ |
| 131 const char *getErrorReason() const { return errorReason; } |
| 132 |
| 133 /** |
| 134 * Gets a script or reorder code from its string representation. |
| 135 * @return the script/reorder code, or |
| 136 * -1 if not recognized |
| 137 */ |
| 138 static int32_t getReorderCode(const char *word); |
| 139 |
| 140 private: |
| 141 /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15, so a strength fits in the low 4 bits selected by STRENGTH_MASK. */ |
| 142 static const int32_t STRENGTH_MASK = 0xf; |
| 143 static const int32_t STARRED_FLAG = 0x10; /* the single bit just above STRENGTH_MASK */ |
| 144 static const int32_t OFFSET_SHIFT = 8; /* NOTE(review): presumably the shift for an offset packed above the strength and flag bits - confirm where it is used */ |
| 145 /** Private overload of parse() without the output arguments, followed by the individual parsing steps. NOTE(review): the int32_t i parameters and int32_t results look like indexes into the rule string (parseSpecialPosition() documents its result that way) - confirm in the implementation. */ |
| 146 void parse(const UnicodeString &ruleString, UErrorCode &errorCode); |
| 147 void parseRuleChain(UErrorCode &errorCode); |
| 148 int32_t parseResetAndPosition(UErrorCode &errorCode); |
| 149 int32_t parseRelationOperator(UErrorCode &errorCode); |
| 150 void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode
); |
| 151 void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCo
de); |
| 152 int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &erro
rCode); |
| 153 int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode); |
| 154 |
| 155 /** |
| 156 * Sets str to a contraction of U+FFFE and (U+2800 + Position). |
| 156 * These are the values of POS_LEAD and POS_BASE declared above. |
| 157 * @return rule index after the special reset position |
| 158 */ |
| 159 int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &erro
rCode); |
| 160 void parseSetting(UErrorCode &errorCode); |
| 161 void parseReordering(const UnicodeString &raw, UErrorCode &errorCode); |
| 162 static UColAttributeValue getOnOffValue(const UnicodeString &s); |
| 163 |
| 164 int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode); |
| 165 int32_t readWords(int32_t i, UnicodeString &raw) const; |
| 166 int32_t skipComment(int32_t i) const; |
| 167 |
| 168 void setParseError(const char *reason, UErrorCode &errorCode); |
| 169 void setErrorContext(); |
| 170 |
| 171 /** |
| 172 * ASCII [:P:] and [:S:]: |
| 173 * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E] |
| 174 */ |
| 175 static UBool isSyntaxChar(UChar32 c); |
| 176 int32_t skipWhiteSpace(int32_t i) const; |
| 177 /** Reference members: they have to be bound in the constructor's initializer list and cannot be reseated afterwards. */ |
| 178 const Normalizer2 &nfd, &nfc; |
| 179 /** Parser state held as raw pointers. NOTE(review): rules, settings and parseError presumably alias the arguments of the public parse() - confirm in the implementation. baseData is a const pointer to const data. */ |
| 180 const UnicodeString *rules; |
| 181 const CollationData *const baseData; |
| 182 CollationSettings *settings; |
| 183 UParseError *parseError; |
| 184 const char *errorReason; /* returned unchanged by getErrorReason() */ |
| 185 /** Aliases installed by setSink() and setImporter(); the setter comments state that no ownership is taken. */ |
| 186 Sink *sink; |
| 187 Importer *importer; |
| 188 |
| 189 int32_t ruleIndex; /* NOTE(review): presumably the current position within *rules - confirm in the implementation */ |
| 190 }; |
| 191 |
| 192 U_NAMESPACE_END |
| 193 |
| 194 #endif // !UCONFIG_NO_COLLATION |
| 195 #endif // __COLLATIONRULEPARSER_H__ |
OLD | NEW |