OLD | NEW |
| (Empty) |
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #ifndef CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H_ | |
6 #define CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H_ | |
7 | |
8 #include <map> | |
9 #include <string> | |
10 | |
11 #include "base/basictypes.h" | |
12 #include "base/string16.h" | |
13 | |
14 #include "unicode/uscript.h" | |
15 | |
16 // A class which handles character attributes dependent on a spellchecker and | |
17 // its dictionary. | |
18 // This class is used by the SpellcheckWordIterator class to determine whether | |
19 // or not a character is one used by the spellchecker and its dictionary. | |
20 class SpellcheckCharAttribute { | |
21 public: | |
22 SpellcheckCharAttribute(); | |
23 | |
24 ~SpellcheckCharAttribute(); | |
25 | |
26 // Sets the default language of the spell checker. This controls which | |
27 // characters are considered parts of words of the given language. | |
28 void SetDefaultLanguage(const std::string& language); | |
29 | |
30 // Returns whether or not the given character is a character used by the | |
31 // selected dictionary. | |
32 // Parameters | |
33 // * character [in] (UChar32) | |
34 // Represents a Unicode character to be checked. | |
35 // Return values | |
36 // * true | |
37 // The given character is a word character. | |
38 // * false | |
39 // The given character is not a word character. | |
40 bool IsWordChar(UChar32 character) const; | |
41 | |
42 // Returns whether or not the given character is a character used by | |
43 // contractions. | |
44 // Parameters | |
45 // * character [in] (UChar32) | |
46 // Represents a Unicode character to be checked. | |
47 // Return values | |
48 // * true | |
49 // The given character is a character used by contractions. | |
50 // * false | |
51 // The given character is not a character used by contractions. | |
52 bool IsContractionChar(UChar32 character) const; | |
53 | |
54 private: | |
55 // Initializes the mapping table (presumably |script_attributes_|). | |
56 void InitializeScriptTable(); | |
57 | |
58 // Retrieves the ICU script code of the given character. | |
59 UScriptCode GetScriptCode(UChar32 character) const; | |
60 | |
61 // Updates the entry for |script_code| in the mapping table to |in_use|. | |
62 void SetWordScript(const int script_code, bool in_use); | |
63 | |
64 // Returns whether or not the given script is used by the selected | |
65 // dictionary. | |
66 bool IsWordScript(const UScriptCode script_code) const; | |
67 | |
68 private: | |
69 // Represents a mapping table from a script code to a boolean value | |
70 // representing whether or not the script is used by the selected dictionary. | |
71 bool script_attributes_[USCRIPT_CODE_LIMIT]; | |
72 | |
73 // Represents a table of characters used by contractions. | |
74 std::map<UChar32, bool> middle_letters_; | |
75 | |
76 DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute); | |
77 }; | |
78 | |
79 // A class which implements methods for finding the location of word boundaries | |
80 // used by the Spellchecker class. | |
81 // This class is implemented on the following assumptions: | |
82 // * An input string is encoded in UTF-16 (i.e. it may contain surrogate | |
83 // pairs), and; | |
84 // * The length of a string is the number of UTF-16 characters in the string | |
85 // (i.e. the length of a non-BMP character becomes two). | |
86 class SpellcheckWordIterator { | |
87 public: | |
88 SpellcheckWordIterator(); | |
89 | |
90 ~SpellcheckWordIterator(); | |
91 | |
92 // Initializes a word-iterator object. | |
93 // Parameters | |
94 // * attribute [in] (const SpellcheckCharAttribute*) | |
95 // Represents a set of character attributes used for filtering out | |
96 // non-word characters. | |
97 // * word [in] (const char16*) | |
98 // Represents a string from which this object extracts words. | |
99 // (This string does not have to be NUL-terminated.) | |
100 // * length [in] (size_t) | |
101 // Represents the length of the given string, in UTF-16 characters. | |
102 // This value should not include terminating NUL characters. | |
103 // * allow_contraction [in] (bool) | |
104 // Represents a flag to control whether or not this object should split a | |
105 // possible contraction (e.g. "isn't", "in'n'out", etc.) | |
106 // Return values | |
107 // * (none) | |
108 // This method is declared void, so it does not report to its caller | |
109 // whether this object was initialized successfully. | |
110 // NOTE(review): consider returning bool if initialization can fail. | |
111 void Initialize(const SpellcheckCharAttribute* attribute, | |
112 const char16* word, | |
113 size_t length, | |
114 bool allow_contraction); | |
115 | |
116 // Retrieves a word (or a contraction). | |
117 // Parameters | |
118 // * word_string [out] (string16*) | |
119 // Represents a word (or a contraction) whose spelling is to be checked. | |
120 // This |word_string| has already been normalized to its canonical form | |
121 // (i.e. ligatures decomposed, full-width Latin characters replaced with | |
122 // their ASCII alternatives, etc.) so that a SpellChecker object can check | |
123 // its spelling without any additional operations. | |
124 // On the other hand, a substring of the input string | |
125 // string16 str(&word[word_start], word_length); | |
126 // represents the non-normalized version of this extracted word. | |
127 // * word_start [out] (int*) | |
128 // Represents the offset of this word from the beginning of the input | |
129 // string, in UTF-16 characters. | |
130 // * word_length [out] (int*) | |
131 // Represents the length of an extracted word before normalization, in | |
132 // UTF-16 characters. | |
133 // When the input string contains ligatures, this value may not be equal | |
134 // to the length of the |word_string|. | |
135 // Return values | |
136 // * true | |
137 // Found a word (or a contraction) whose spelling is to be checked. | |
138 // * false | |
139 // No more words or contractions to be checked were found. | |
140 bool GetNextWord(string16* word_string, | |
141 int* word_start, | |
142 int* word_length); | |
143 | |
144 private: | |
145 // Retrieves a segment consisting of word characters (and contraction | |
146 // characters if the |allow_contraction_| value is true). | |
147 void GetSegment(int* segment_start, | |
148 int* segment_end); | |
149 | |
150 // Discards non-word characters at the beginning and the end of the given | |
151 // segment. | |
152 void TrimSegment(int segment_start, | |
153 int segment_end, | |
154 int* word_start, | |
155 int* word_length) const; | |
156 | |
157 // Normalizes the given segment of the |word_| variable and writes its | |
158 // canonical form to the |output_string|. | |
159 bool Normalize(int input_start, | |
160 int input_length, | |
161 string16* output_string) const; | |
162 | |
163 private: | |
164 // The pointer to the input string from which we are extracting words. | |
165 const char16* word_; | |
166 | |
167 // The length of the original string. | |
168 int length_; | |
169 | |
170 // The current position in the original string. | |
171 int position_; | |
172 | |
173 // The flag to control whether or not this object should extract possible | |
174 // contractions. | |
175 bool allow_contraction_; | |
176 | |
177 // The character attributes used for filtering out non-word characters. | |
178 const SpellcheckCharAttribute* attribute_; | |
179 | |
180 DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator); | |
181 }; | |
182 | |
183 #endif // CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H_ | |
OLD | NEW |