OLD | NEW |
(Empty) | |
| 1 // |
| 2 // rbbiscan.h |
| 3 // |
| 4 // Copyright (C) 2002-2008, International Business Machines Corporation and oth
ers. |
| 5 // All Rights Reserved. |
| 6 // |
| 7 // This file contains declarations for class RBBIRuleScanner |
| 8 // |
| 9 |
| 10 |
| 11 #ifndef RBBISCAN_H |
| 12 #define RBBISCAN_H |
| 13 |
| 14 #include "unicode/utypes.h" |
| 15 #include "unicode/uobject.h" |
| 16 #include "unicode/rbbi.h" |
| 17 #include "unicode/uniset.h" |
| 18 #include "unicode/parseerr.h" |
| 19 #include "uhash.h" |
| 20 #include "uvector.h" |
| 21 #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that |
| 22 // looks up references to $variables within a set. |
| 23 #include "rbbinode.h" |
| 24 //#include "rbbitblb.h" |
| 25 |
| 26 |
| 27 |
| 28 U_NAMESPACE_BEGIN |
| 29 |
| 30 class RBBIRuleBuilder; |
| 31 class RBBISymbolTable; |
| 32 |
| 33 |
| 34 //------------------------------------------------------------------------------
-- |
| 35 // |
| 36 // class RBBIRuleScanner does the lowest level, character-at-a-time |
| 37 // scanning of break iterator rules. |
| 38 // |
| 39 // The output of the scanner is parse trees for |
| 40 // the rule expressions and a list of all Unicode Sets |
| 41 // encountered. |
| 42 // |
| 43 //------------------------------------------------------------------------------
-- |
| 44 |
| 45 class RBBIRuleScanner : public UMemory { |
| 46 public: |
| 47 |
| 48 enum { |
| 49 kStackSize = 100 // The size of the state stack for |
| 50 }; // rules parsing. Corresponds roughly |
| 51 // to the depth of parentheses nesting |
| 52 // that is allowed in the rules. |
| 53 |
| 54 struct RBBIRuleChar { |
| 55 UChar32 fChar; |
| 56 UBool fEscaped; |
| 57 }; |
| 58 |
| 59 RBBIRuleScanner(RBBIRuleBuilder *rb); |
| 60 |
| 61 |
| 62 virtual ~RBBIRuleScanner(); |
| 63 |
| 64 void nextChar(RBBIRuleChar &c); // Get the next char from th
e input stream. |
| 65 // NOTE(review): declared void, so no value is returned at end of input; confirm how end is signalled. |
| 66 |
| 67 UBool push(const RBBIRuleChar &c); // Push (unget) one characte
r. |
| 68 // Only a single character
may be pushed. |
| 69 |
| 70 void parse(); // Parse the rules, generati
ng two parse |
| 71 // trees, one each for the
forward and |
| 72 // reverse rules, |
| 73 // and a list of UnicodeSe
ts encountered. |
| 74 |
| 75 /** |
| 76 * Return a rules string without unnecessary |
| 77 * characters. |
| 78 */ |
| 79 static UnicodeString stripRules(const UnicodeString &rules); |
| 80 private: |
| 81 |
| 82 UBool doParseActions(int32_t a); |
| 83 void error(UErrorCode e); // error reporting conven
ience function. |
| 84 void fixOpStack(RBBINode::OpPrecedence p); |
| 85 // NOTE(review): orphaned comment fragment ("a character."); the declaration it described is not visible here. |
| 86 void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *s
etToAdopt = NULL); |
| 87 |
| 88 UChar32 nextCharLL(); |
| 89 #ifdef RBBI_DEBUG |
| 90 void printNodeStack(const char *title); |
| 91 #endif |
| 92 RBBINode *pushNewNode(RBBINode::NodeType t); |
| 93 void scanSet(); |
| 94 |
| 95 |
| 96 RBBIRuleBuilder *fRB; // The rule builder that we
are part of. |
| 97 |
| 98 int32_t fScanIndex; // Index of current charact
er being processed |
| 99 // in the rule input stri
ng. |
| 100 int32_t fNextIndex; // Index of the next charac
ter, which |
| 101 // is the first character
not yet scanned. |
| 102 UBool fQuoteMode; // Scan is in a 'quoted reg
ion' |
| 103 int32_t fLineNum; // Line number in input fil
e. |
| 104 int32_t fCharNum; // Char position within the
line. |
| 105 UChar32 fLastChar; // Previous char, needed to
count CR-LF |
| 106 // as a single line, not
two. |
| 107 |
| 108 RBBIRuleChar fC; // Current char for parse s
tate machine |
| 109 // processing. |
| 110 UnicodeString fVarName; // $variableName, valid whe
n we've just |
| 111 // scanned one. |
| 112 |
| 113 RBBIRuleTableEl **fStateTable; // State Transition Table f
or RBBI Rule |
| 114 // parsing. index by p[s
tate][char-class] |
| 115 |
| 116 uint16_t fStack[kStackSize]; // State stack, holds sta
te pushes |
| 117 int32_t fStackPtr; // and pops as specified
in the state |
| 118 // transition rules. |
| 119 |
| 120 RBBINode *fNodeStack[kStackSize]; // Node stack, holds
nodes created |
| 121 // during the parse
of a rule |
| 122 int32_t fNodeStackPtr; |
| 123 |
| 124 |
| 125 UBool fReverseRule; // True if the rule current
ly being scanned |
| 126 // is a reverse direction
rule (if it |
| 127 // starts with a '!') |
| 128 |
| 129 UBool fLookAheadRule; // True if the rule include
s a '/' |
| 130 // somewhere within it. |
| 131 |
| 132 RBBISymbolTable *fSymbolTable; // symbol table, holds defi
nitions of |
| 133 // $variable symbols. |
| 134 |
| 135 UHashtable *fSetTable; // UnicodeSet hash table,
holds indexes to |
| 136 // the sets created while
parsing rules. |
| 137 // The key is the string
used for creating |
| 138 // the set. |
| 139 |
| 140 UnicodeSet fRuleSets[10]; // Unicode Sets that are ne
eded during |
| 141 // the scanning of RBBI ru
les. The |
| 142 // indices for these are
assigned by the |
| 143 // perl script that builds
the state tables. |
| 144 // See rbbirpt.h. |
| 145 |
| 146 int32_t fRuleNum; // Counts each rule as it i
s scanned. |
| 147 |
| 148 int32_t fOptionStart; // Input index of start of
a !!option |
| 149 // keyword, while being s
canned. |
| 150 |
| 151 UnicodeSet *gRuleSet_rule_char; |
| 152 UnicodeSet *gRuleSet_white_space; |
| 153 UnicodeSet *gRuleSet_name_char; |
| 154 UnicodeSet *gRuleSet_name_start_char; |
| 155 |
| 156 RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this cla
ss |
| 157 RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying
of this class |
| 158 }; |
| 159 |
| 160 U_NAMESPACE_END |
| 161 |
| 162 #endif |
OLD | NEW |