| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #ifndef CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H_ | |
| 6 #define CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H_ | |
| 7 | |
| 8 #include <map> | |
| 9 #include <string> | |
| 10 | |
| 11 #include "base/basictypes.h" | |
| 12 #include "base/string16.h" | |
| 13 | |
| 14 #include "unicode/uscript.h" | |
| 15 | |
| 16 // A class which handles character attributes dependent on a spellchecker and | |
| 17 // its dictionary. | |
| 18 // This class is used by the SpellcheckWordIterator class to determine whether | |
| 19 // or not a character is one used by the spellchecker and its dictinary. | |
| 20 class SpellcheckCharAttribute { | |
| 21 public: | |
| 22 SpellcheckCharAttribute(); | |
| 23 | |
| 24 ~SpellcheckCharAttribute(); | |
| 25 | |
| 26 // Sets the default language of the spell checker. This controls which | |
| 27 // characters are considered parts of words of the given language. | |
| 28 void SetDefaultLanguage(const std::string& language); | |
| 29 | |
| 30 // Returns whether or not the given character is a character used by the | |
| 31 // selected dictionary. | |
| 32 // Parameters | |
| 33 // * character [in] (UChar32) | |
| 34 // Represents a Unicode character to be checked. | |
| 35 // Return values | |
| 36 // * true | |
| 37 // The given character is a word character. | |
| 38 // * false | |
| 39 // The given character is not a word character. | |
| 40 bool IsWordChar(UChar32 character) const; | |
| 41 | |
| 42 // Returns whether or not the given character is a character used by | |
| 43 // contractions. | |
| 44 // Parameters | |
| 45 // * character [in] (UChar32) | |
| 46 // Represents a Unicode character to be checked. | |
| 47 // Return values | |
| 48 // * true | |
| 49 // The given character is a character used by contractions. | |
| 50 // * false | |
| 51 // The given character is not a character used by contractions. | |
| 52 bool IsContractionChar(UChar32 character) const; | |
| 53 | |
| 54 private: | |
| 55 // Initializes the mapping table. | |
| 56 void InitializeScriptTable(); | |
| 57 | |
| 58 // Retrieves the ICU script code. | |
| 59 UScriptCode GetScriptCode(UChar32 character) const; | |
| 60 | |
| 61 // Updates an entry in the mapping table. | |
| 62 void SetWordScript(const int script_code, bool in_use); | |
| 63 | |
| 64 // Returns whether or not the given script is used by the selected | |
| 65 // dictionary. | |
| 66 bool IsWordScript(const UScriptCode script_code) const; | |
| 67 | |
| 68 private: | |
| 69 // Represents a mapping table from a script code to a boolean value | |
| 70 // representing whether or not the script is used by the selected dictionary. | |
| 71 bool script_attributes_[USCRIPT_CODE_LIMIT]; | |
| 72 | |
| 73 // Represents a table of characters used by contractions. | |
| 74 std::map<UChar32, bool> middle_letters_; | |
| 75 | |
| 76 DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute); | |
| 77 }; | |
| 78 | |
| 79 // A class which implements methods for finding the location of word boundaries | |
| 80 // used by the Spellchecker class. | |
| 81 // This class is implemented on the following assumptions: | |
| 82 // * An input string is encoded in UTF-16 (i.e. it may contain surrogate | |
| 83 // pairs), and; | |
| 84 // * The length of a string is the number of UTF-16 characters in the string | |
| 85 // (i.e. the length of a non-BMP character becomes two). | |
| 86 class SpellcheckWordIterator { | |
| 87 public: | |
| 88 SpellcheckWordIterator(); | |
| 89 | |
| 90 ~SpellcheckWordIterator(); | |
| 91 | |
| 92 // Initializes a word-iterator object. | |
| 93 // Parameters | |
| 94 // * attribute [in] (const SpellcheckCharAttribute*) | |
| 95 // Represents a set of character attributes used for filtering out | |
| 96 // non-word characters. | |
| 97 // * word [in] (const char16*) | |
| 98 // Represents a string from which this object extracts words. | |
| 99 // (This string does not have to be NUL-terminated.) | |
| 100 // * length [in] (size_t) | |
| 101 // Represents the length of the given string, in UTF-16 characters. | |
| 102 // This value should not include terminating NUL characters. | |
| 103 // * allow_contraction [in] (bool) | |
| 104 // Represents a flag to control whether or not this object should split a | |
| 105 // possible contraction (e.g. "isn't", "in'n'out", etc.) | |
| 106 // Return values | |
| 107 // * true | |
| 108 // This word-iterator object is initialized successfully. | |
| 109 // * false | |
| 110 // An error occured while initializing this object. | |
| 111 void Initialize(const SpellcheckCharAttribute* attribute, | |
| 112 const char16* word, | |
| 113 size_t length, | |
| 114 bool allow_contraction); | |
| 115 | |
| 116 // Retrieves a word (or a contraction). | |
| 117 // Parameters | |
| 118 // * word_string [out] (string16*) | |
| 119 // Represents a word (or a contraction) to be checked its spelling. | |
| 120 // This |word_string| has been already normalized to its canonical form | |
| 121 // (i.e. decomposed ligatures, replaced full-width latin characters to | |
| 122 // its ASCII alternatives, etc.) so that a SpellChecker object can check | |
| 123 // its spelling without any additional operations. | |
| 124 // On the other hand, a substring of the input string | |
| 125 // string16 str(&word[word_start], word_length); | |
| 126 // represents the non-normalized version of this extracted word. | |
| 127 // * word_start [out] (int*) | |
| 128 // Represents the offset of this word from the beginning of the input | |
| 129 // string, in UTF-16 characters. | |
| 130 // * word_length [out] (int*) | |
| 131 // Represents the length of an extracted word before normalization, in | |
| 132 // UTF-16 characters. | |
| 133 // When the input string contains ligatures, this value may not be equal | |
| 134 // to the length of the |word_string|. | |
| 135 // Return values | |
| 136 // * true | |
| 137 // Found a word (or a contraction) to be checked its spelling. | |
| 138 // * false | |
| 139 // Not found any more words or contractions to be checked their spellings. | |
| 140 bool GetNextWord(string16* word_string, | |
| 141 int* word_start, | |
| 142 int* word_length); | |
| 143 | |
| 144 private: | |
| 145 // Retrieves a segment consisting of word characters (and contraction | |
| 146 // characters if the |allow_contraction| value is true). | |
| 147 void GetSegment(int* segment_start, | |
| 148 int* segment_end); | |
| 149 | |
| 150 // Discards non-word characters at the beginning and the end of the given | |
| 151 // segment. | |
| 152 void TrimSegment(int segment_start, | |
| 153 int segment_end, | |
| 154 int* word_start, | |
| 155 int* word_length) const; | |
| 156 | |
| 157 // Normalizes the given segment of the |word_| variable and write its | |
| 158 // canonical form to the |output_string|. | |
| 159 bool Normalize(int input_start, | |
| 160 int input_length, | |
| 161 string16* output_string) const; | |
| 162 | |
| 163 private: | |
| 164 // The pointer to the input string from which we are extracting words. | |
| 165 const char16* word_; | |
| 166 | |
| 167 // The length of the original string. | |
| 168 int length_; | |
| 169 | |
| 170 // The current position in the original string. | |
| 171 int position_; | |
| 172 | |
| 173 // The flag to control whether or not this object should extract possible | |
| 174 // contractions. | |
| 175 bool allow_contraction_; | |
| 176 | |
| 177 // The character attributes used for filtering out non-word characters. | |
| 178 const SpellcheckCharAttribute* attribute_; | |
| 179 | |
| 180 DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator); | |
| 181 }; | |
| 182 | |
| 183 #endif // CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H_ | |
| OLD | NEW |