Index: chrome/renderer/spellchecker/spellcheck_worditerator.h |
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator.h b/chrome/renderer/spellchecker/spellcheck_worditerator.h |
deleted file mode 100644 |
index be40e980dab20436916356b949a2d4496f1b3766..0000000000000000000000000000000000000000 |
--- a/chrome/renderer/spellchecker/spellcheck_worditerator.h |
+++ /dev/null |
@@ -1,200 +0,0 @@ |
-// Copyright (c) 2011 The Chromium Authors. All rights reserved. |
-// Use of this source code is governed by a BSD-style license that can be |
-// found in the LICENSE file. |
- |
-// Defines an iterator class that enumerates words supported by our spellchecker |
-// from multi-language text. This class is used for filtering out characters |
-// not supported by our spellchecker. |
- |
-#ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ |
-#define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ |
- |
-#include <stddef.h> |
- |
-#include <memory> |
-#include <string> |
- |
-#include "base/macros.h" |
-#include "base/strings/string16.h" |
-#include "third_party/icu/source/common/unicode/uscript.h" |
- |
-namespace base { |
-namespace i18n { |
-class BreakIterator; |
-} // namespace i18n |
-} // namespace base |
- |
-// A class which encapsulates language-specific operations used by |
-// SpellcheckWordIterator. When we set the spellchecker language, this class |
-// creates rule sets that filter out the characters not supported by the |
-// spellchecker. (Please read the comment in the SpellcheckWordIterator class |
-// about how to use this class.) |
-class SpellcheckCharAttribute { |
- public: |
-  SpellcheckCharAttribute(); |
-  ~SpellcheckCharAttribute(); |
- |
-  // Sets the language of the spellchecker. When this function is called with an |
-  // ISO language code, this function creates the custom rule-sets used by |
-  // the ICU break iterator so it can extract only words used by the language. |
-  // GetRuleSet() returns the rule-sets created in this function. |
-  void SetDefaultLanguage(const std::string& language); |
- |
-  // Returns a custom rule-set string used by the ICU break iterator. This class |
-  // has two rule-sets, one splits a contraction and the other does not, so we |
-  // can split a concatenated word (e.g. "seven-year-old") into words (e.g. |
-  // "seven", "year", and "old") and check their spellings. The result string is |
-  // encoded in UTF-16 since ICU needs UTF-16 strings. |
-  base::string16 GetRuleSet(bool allow_contraction) const; |
- |
-  // Outputs a character only if it is a word character. (Please read the |
-  // comments on CreateRuleSets() for why we need this function.) |
-  bool OutputChar(UChar c, base::string16* output) const; |
- |
- private: |
-  // Creates the rule-sets that return words possibly used by the given |
-  // language. Unfortunately, these rule-sets are not perfect and have some |
-  // false-positives. For example, they return combined accent marks even though |
-  // we need English words only. We call OutputChar() to filter out such |
-  // false-positive characters. |
-  void CreateRuleSets(const std::string& language); |
- |
-  // Outputs a character only if it is one used by the given language. These |
-  // functions are called from OutputChar(). |
-  bool OutputArabic(UChar c, base::string16* output) const; |
-  bool OutputHangul(UChar c, base::string16* output) const; |
-  bool OutputHebrew(UChar c, base::string16* output) const; |
-  bool OutputDefault(UChar c, base::string16* output) const; |
- |
-  // The custom rule-set strings used by ICU break iterator. Since it is not so |
-  // easy to create custom rule-sets from an ISO language code, this class |
-  // saves these rule-set strings created when we set the language. |
-  base::string16 ruleset_allow_contraction_; |
-  base::string16 ruleset_disallow_contraction_; |
- |
-  // The script code used by this language. |
-  UScriptCode script_code_; |
- |
-  DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute); |
-}; |
- |
-// A class which extracts words that can be checked for spelling from a |
-// multi-language string. The ICU word-break iterator does not discard some |
-// punctuation characters attached to a word. For example, when we set a word |
-// "_hello_" to a word-break iterator, it just returns "_hello_". Neither does |
-// it discard characters not used by the language. For example, it returns |
-// Russian words even though we need English words only. To extract only the |
-// words whose spellings our spellchecker can check, this class uses custom |
-// rule-sets created by the SpellcheckCharAttribute class. Also, this class |
-// normalizes extracted words so our spellchecker can check the spellings of |
-// words that include ligatures, combined characters, full-width characters, |
-// etc. This class uses UTF-16 strings as its input and output strings since |
-// UTF-16 is the native encoding of ICU and this avoids unnecessary conversions |
-// when changing the encoding of this string for our spellchecker. (Chrome can |
-// use two or more spellcheckers and we cannot assume their encodings.) |
-// The following snippet is an example that extracts words with this class. |
-// |
-// // Creates the language-specific attributes for US English. |
-// SpellcheckCharAttribute attribute; |
-// attribute.SetDefaultLanguage("en-US"); |
-// |
-// // Set up a SpellcheckWordIterator object which extracts English words, |
-// // and retrieve them. |
-// SpellcheckWordIterator iterator; |
-// base::string16 text(base::UTF8ToUTF16("this is a test.")); |
-// iterator.Initialize(&attribute, true); |
-//   iterator.SetText(text.c_str(), text.length()); |
-// |
-//   base::string16 word; |
-//   int offset, length; |
-//   while (iterator.GetNextWord(&word, &offset, &length) != |
-//          SpellcheckWordIterator::IS_END_OF_TEXT) { |
-//     ... |
-//   } |
-// |
-class SpellcheckWordIterator { |
- public: |
-  enum WordIteratorStatus { |
-    // The end of a sequence of text that the iterator recognizes as characters |
-    // that can form a word. |
-    IS_WORD, |
-    // Non-word characters that the iterator can skip past, such as punctuation, |
-    // whitespace, and characters from another character set. |
-    IS_SKIPPABLE, |
-    // The end of the text that the iterator is going over. |
-    IS_END_OF_TEXT |
-  }; |
- |
-  SpellcheckWordIterator(); |
-  ~SpellcheckWordIterator(); |
- |
-  // Initializes a word-iterator object with the language-specific attribute. If |
-  // we need to split contractions and concatenated words, call this function |
-  // with its 'allow_contraction' parameter false. (This function uses lots of |
-  // temporary memory to compile a custom word-break rule into an automaton.) |
-  bool Initialize(const SpellcheckCharAttribute* attribute, |
-                  bool allow_contraction); |
- |
-  // Returns whether this word iterator is initialized. |
-  bool IsInitialized() const; |
- |
-  // Set text to be iterated. (This text does not have to be NULL-terminated.) |
-  // This function also resets internal state so we can reuse this iterator |
-  // without calling Initialize(). |
-  bool SetText(const base::char16* text, size_t length); |
- |
-  // Advances |iterator_| through |text_| and gets the current status of the |
-  // word iterator within |text_|: |
-  // |
-  // - Returns IS_WORD if the iterator just found the end of a sequence of word |
-  //   characters and it was able to normalize the sequence. This stores the |
-  //   normalized string into |word_string| and stores the position and length |
-  //   into |word_start| and |word_length| respectively. Keep in mind that |
-  //   since this function normalizes the output word, the length of |
-  //   |word_string| may be different from |word_length|. Therefore, when |
-  //   we call functions that change the input text, such as |
-  //   string16::replace(), use |word_start| and |word_length| as follows: |
-  // |
-  //     if (iterator.GetNextWord(&word, &offset, &length) == |
-  //         SpellcheckWordIterator::IS_WORD) |
-  //       text.replace(offset, length, word); |
-  // |
-  // - Returns IS_SKIPPABLE if the iterator just found a character that the |
-  //   iterator can skip past such as punctuation, whitespace, and characters |
-  //   from another character set. This stores the character, position, and |
-  //   length into |word_string|, |word_start|, and |word_length| respectively. |
-  // |
-  // - Returns IS_END_OF_TEXT if the iterator has reached the end of |text_|. |
-  SpellcheckWordIterator::WordIteratorStatus |
-  GetNextWord(base::string16* word_string, int* word_start, int* word_length); |
- |
-  // Releases all the resources attached to this object. |
-  void Reset(); |
- |
- private: |
-  // Normalizes a non-terminated string returned from an ICU word-break |
-  // iterator. A word returned from an ICU break iterator may include characters |
-  // not supported by our spellchecker, e.g. ligatures, combining characters, |
-  // full-width letters, etc. This function replaces such characters with |
-  // alternative characters supported by our spellchecker. This function also |
-  // calls SpellcheckCharAttribute::OutputChar() to filter out false-positive |
-  // characters. |
-  bool Normalize(int input_start, |
-                 int input_length, |
-                 base::string16* output_string) const; |
- |
-  // The pointer to the input string from which we are extracting words. |
-  const base::char16* text_; |
- |
-  // The language-specific attributes used for filtering out non-word |
-  // characters. |
-  const SpellcheckCharAttribute* attribute_; |
- |
-  // The break iterator. |
-  std::unique_ptr<base::i18n::BreakIterator> iterator_; |
- |
-  DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator); |
-}; |
- |
-#endif // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ |
- |