| Index: chrome/browser/spellcheck_worditerator.cc
|
| ===================================================================
|
| --- chrome/browser/spellcheck_worditerator.cc (revision 32394)
|
| +++ chrome/browser/spellcheck_worditerator.cc (working copy)
|
| @@ -1,274 +0,0 @@
|
| -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
|
| -// Use of this source code is governed by a BSD-style license that can be
|
| -// found in the LICENSE file.
|
| -
|
| -#include "chrome/browser/spellcheck_worditerator.h"
|
| -
|
| -#include <map>
|
| -#include <string>
|
| -
|
| -#include "base/basictypes.h"
|
| -#include "base/string_util.h"
|
| -#include "chrome/browser/spellchecker.h"
|
| -
|
| -#include "third_party/icu/public/common/unicode/normlzr.h"
|
| -#include "third_party/icu/public/common/unicode/schriter.h"
|
| -#include "third_party/icu/public/common/unicode/uchar.h"
|
| -#include "third_party/icu/public/common/unicode/uscript.h"
|
| -#include "third_party/icu/public/common/unicode/uset.h"
|
| -#include "third_party/icu/public/i18n/unicode/ulocdata.h"
|
| -
|
| -// Builds the default attribute tables: no script is a word script, and the
|
| -// contraction table holds the Unicode 'MidLetter' and 'MidNumLet' characters.
|
| -SpellcheckCharAttribute::SpellcheckCharAttribute() {
|
| -  // Start with every script marked as "not used by the dictionary".
|
| -  InitializeScriptTable();
|
| -
|
| -  // Although many dictionaries treat numbers and contractions as words (and
|
| -  // so treat USCRIPT_COMMON characters as word characters), the
|
| -  // SpellcheckWordIterator class treats USCRIPT_COMMON characters as non-word
|
| -  // characters in order to strictly distinguish contraction characters from
|
| -  // word characters.
|
| -  SetWordScript(USCRIPT_COMMON, false);
|
| -
|
| -  // Characters that may appear inside a contraction. The list is made of the
|
| -  // 'MidLetter' and 'MidNumLet' entries of the word-break property list
|
| -  // provided by Unicode, Inc.:
|
| -  // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt
|
| -  static const UChar32 kMidLetters[] = {
|
| -    0x003A,  // MidLetter # COLON
|
| -    0x00B7,  // MidLetter # MIDDLE DOT
|
| -    0x0387,  // MidLetter # GREEK ANO TELEIA
|
| -    0x05F4,  // MidLetter # HEBREW PUNCTUATION GERSHAYIM
|
| -    0x2027,  // MidLetter # HYPHENATION POINT
|
| -    0xFE13,  // MidLetter # PRESENTATION FORM FOR VERTICAL COLON
|
| -    0xFE55,  // MidLetter # SMALL COLON
|
| -    0xFF1A,  // MidLetter # FULLWIDTH COLON
|
| -    0x0027,  // MidNumLet # APOSTROPHE
|
| -    0x002E,  // MidNumLet # FULL STOP
|
| -    0x2018,  // MidNumLet # LEFT SINGLE QUOTATION MARK
|
| -    0x2019,  // MidNumLet # RIGHT SINGLE QUOTATION MARK
|
| -    0x2024,  // MidNumLet # ONE DOT LEADER
|
| -    0xFE52,  // MidNumLet # SMALL FULL STOP
|
| -    0xFF07,  // MidNumLet # FULLWIDTH APOSTROPHE
|
| -    0xFF0E,  // MidNumLet # FULLWIDTH FULL STOP
|
| -  };
|
| -  const UChar32* const table_end = kMidLetters + arraysize(kMidLetters);
|
| -  for (const UChar32* entry = kMidLetters; entry != table_end; ++entry)
|
| -    middle_letters_[*entry] = true;
|
| -}
|
| -
|
| -// The destructor body is intentionally empty.
|
| -SpellcheckCharAttribute::~SpellcheckCharAttribute() {}
|
| -
|
| -// Sets the default language for this object.
|
| -// This function opens the ICU locale data for |language|, retrieves its
|
| -// standard exemplar set, and marks the script of every exemplar character
|
| -// (and of every character in the decomposed form of those characters) as a
|
| -// word script. It returns without changing anything when the locale data
|
| -// cannot be opened, and marks nothing when the exemplar set cannot be
|
| -// retrieved.
|
| -void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) {
|
| -  UErrorCode status = U_ZERO_ERROR;
|
| -  ULocaleData* locale_data = ulocdata_open(language.c_str(), &status);
|
| -  if (U_FAILURE(status))
|
| -    return;
|
| -
|
| -  // Retrieve the exemplar set of the given language. The locale data is no
|
| -  // longer needed once the set has been filled in, so close it right away.
|
| -  USet* exemplar_set = uset_open(1, 0);
|
| -  ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD,
|
| -                          &status);
|
| -  ulocdata_close(locale_data);
|
| -  if (U_SUCCESS(status)) {
|
| -    // Update the character-attribute table to treat the exemplar characters
|
| -    // as word characters, and collect them into |composed| at the same time.
|
| -    icu::UnicodeString composed;
|
| -    const int length = uset_size(exemplar_set);
|
| -    for (int i = 0; i < length; ++i) {
|
| -      const UChar32 character = uset_charAt(exemplar_set, i);
|
| -      // uset_size() counts multi-character strings as well as code points,
|
| -      // and uset_charAt() returns -1 for indices beyond the code points.
|
| -      // Skip those entries instead of passing an invalid code point on.
|
| -      if (character < 0)
|
| -        continue;
|
| -      SetWordScript(GetScriptCode(character), true);
|
| -      composed.append(character);
|
| -    }
|
| -
|
| -    // Many languages use combining characters to input their characters from
|
| -    // keyboards. On the other hand, this exemplar set does not always include
|
| -    // combining characters for such languages.
|
| -    // To treat such combining characters as word characters, we decompose
|
| -    // the exemplar characters and treat the decomposed characters as word
|
| -    // characters too.
|
| -    icu::UnicodeString decomposed;
|
| -    icu::Normalizer::decompose(composed, FALSE, 0, decomposed, status);
|
| -    if (U_SUCCESS(status)) {
|
| -      icu::StringCharacterIterator iterator(decomposed);
|
| -      UChar32 character = iterator.first32();
|
| -      while (character != icu::CharacterIterator::DONE) {
|
| -        SetWordScript(GetScriptCode(character), true);
|
| -        character = iterator.next32();
|
| -      }
|
| -    }
|
| -  }
|
| -  uset_close(exemplar_set);
|
| -}
|
| -
|
| -// Returns true when |character| belongs to a script used by the selected
|
| -// dictionary and is not a digit.
|
| -bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const {
|
| -  if (u_isdigit(character))
|
| -    return false;
|
| -  return IsWordScript(GetScriptCode(character));
|
| -}
|
| -
|
| -// Returns true when |character| is registered in |middle_letters_| with a
|
| -// true value, i.e. when it is a character used by contractions.
|
| -bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const {
|
| -  const std::map<UChar32, bool>::const_iterator entry =
|
| -      middle_letters_.find(character);
|
| -  return entry != middle_letters_.end() && entry->second;
|
| -}
|
| -
|
| -// Resets the script-attribute table so that no script is marked as used.
|
| -void SpellcheckCharAttribute::InitializeScriptTable() {
|
| -  size_t remaining = arraysize(script_attributes_);
|
| -  while (remaining > 0)
|
| -    script_attributes_[--remaining] = false;
|
| -}
|
| -
|
| -// Looks up the ICU script code of |character|. Returns USCRIPT_INVALID_CODE
|
| -// when ICU reports a failure.
|
| -UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const {
|
| -  UErrorCode error = U_ZERO_ERROR;
|
| -  const UScriptCode script = uscript_getScript(character, &error);
|
| -  if (U_FAILURE(error))
|
| -    return USCRIPT_INVALID_CODE;
|
| -  return script;
|
| -}
|
| -
|
| -// Records whether the script identified by |script_code| is used by the
|
| -// selected dictionary. Codes outside the bounds of |script_attributes_| are
|
| -// ignored.
|
| -void SpellcheckCharAttribute::SetWordScript(const int script_code,
|
| -                                            bool in_use) {
|
| -  const bool in_range =
|
| -      script_code >= 0 &&
|
| -      static_cast<size_t>(script_code) < arraysize(script_attributes_);
|
| -  if (in_range)
|
| -    script_attributes_[script_code] = in_use;
|
| -}
|
| -
|
| -// Returns the attribute stored for |script_code|, i.e. whether the script is
|
| -// used by the selected dictionary. Codes outside the bounds of
|
| -// |script_attributes_| are reported as unused.
|
| -bool SpellcheckCharAttribute::IsWordScript(
|
| -    const UScriptCode script_code) const {
|
| -  const bool in_range =
|
| -      script_code >= 0 &&
|
| -      static_cast<size_t>(script_code) < arraysize(script_attributes_);
|
| -  return in_range ? script_attributes_[script_code] : false;
|
| -}
|
| -
|
| -// Creates an iterator that refers to no text and no attribute table;
|
| -// Initialize() supplies both.
|
| -SpellcheckWordIterator::SpellcheckWordIterator()
|
| -    : word_(NULL), length_(0), position_(0),
|
| -      allow_contraction_(false), attribute_(NULL) {}
|
| -
|
| -// The destructor body is intentionally empty; this function does not free
|
| -// |word_| or |attribute_|.
|
| -SpellcheckWordIterator::~SpellcheckWordIterator() {}
|
| -
|
| -// Points this iterator at the |length| UTF-16 code units starting at |word|
|
| -// and rewinds it to the beginning of that text.
|
| -// |attribute| supplies the character classification, and |allow_contraction|
|
| -// selects whether contraction characters may be part of a segment.
|
| -// Only the pointers are stored; the text and the attribute object are not
|
| -// copied.
|
| -void SpellcheckWordIterator::Initialize(
|
| -    const SpellcheckCharAttribute* attribute,
|
| -    const char16* word,
|
| -    size_t length,
|
| -    bool allow_contraction) {
|
| -  attribute_ = attribute;
|
| -  allow_contraction_ = allow_contraction;
|
| -
|
| -  word_ = word;
|
| -  length_ = static_cast<int>(length);
|
| -  position_ = 0;
|
| -}
|
| -
|
| -// Retrieves the next word (or contraction) from the text.
|
| -// When a contraction is enclosed with contraction characters (e.g. 'isn't',
|
| -// 'rock'n'roll'), the enclosing characters at the beginning and the end must
|
| -// be discarded, but the contraction itself must never be split.
|
| -// To handle this case easily, this function first extracts a segment made of
|
| -// word characters and contraction characters, and then trims the contraction
|
| -// characters from both ends of that segment.
|
| -// On success, |word_string| receives the normalized word while |word_start|
|
| -// and |word_length| describe its position in the original text. Returns
|
| -// false when the text is exhausted or when normalization fails.
|
| -bool SpellcheckWordIterator::GetNextWord(string16* word_string,
|
| -                                         int* word_start,
|
| -                                         int* word_length) {
|
| -  word_string->clear();
|
| -  *word_start = 0;
|
| -  *word_length = 0;
|
| -
|
| -  while (position_ < length_) {
|
| -    int begin = 0;
|
| -    int end = 0;
|
| -    GetSegment(&begin, &end);
|
| -    TrimSegment(begin, end, word_start, word_length);
|
| -    // A segment without any word character trims down to nothing; move on
|
| -    // to the next segment.
|
| -    if (*word_length <= 0)
|
| -      continue;
|
| -    return Normalize(*word_start, *word_length, word_string);
|
| -  }
|
| -
|
| -  return false;
|
| -}
|
| -
|
| -// Retrieves a segment consisting of word characters (and contraction
|
| -// characters if the |allow_contraction_| value is true), starting at the
|
| -// current position, and advances the current position past it.
|
| -// The character that ends the scan is included in the segment. So, when the
|
| -// current position refers to a non-word character, this function returns a
|
| -// non-empty segment consisting of that character only. In this case, the
|
| -// TrimSegment() function discards the character and yields an empty word
|
| -// (i.e. |word_length| == 0).
|
| -void SpellcheckWordIterator::GetSegment(int* segment_start,
|
| -                                        int* segment_end) {
|
| -  const int start = position_;
|
| -  int cursor = start;
|
| -  while (cursor < length_) {
|
| -    UChar32 code_point;
|
| -    U16_NEXT(word_, cursor, length_, code_point);
|
| -    const bool belongs_to_segment =
|
| -        attribute_->IsWordChar(code_point) ||
|
| -        (allow_contraction_ && attribute_->IsContractionChar(code_point));
|
| -    if (!belongs_to_segment)
|
| -      break;
|
| -  }
|
| -  *segment_start = start;
|
| -  *segment_end = cursor;
|
| -  position_ = cursor;
|
| -}
|
| -
|
| -// Discards non-word characters at the beginning and the end of the segment
|
| -// [segment_start, segment_end) of |word_|.
|
| -// |*word_start| is written only when the forward scan finds a word
|
| -// character, and |*word_length| only when the backward scan finds one;
|
| -// otherwise they keep the values supplied by the caller. The caller must
|
| -// therefore initialize both (GetNextWord() sets them to 0).
|
| -void SpellcheckWordIterator::TrimSegment(int segment_start,
|
| -                                         int segment_end,
|
| -                                         int* word_start,
|
| -                                         int* word_length) const {
|
| -  // Skip leading non-word characters.
|
| -  while (segment_start < segment_end) {
|
| -    UChar32 character;
|
| -    int segment_next = segment_start;
|
| -    U16_NEXT(word_, segment_next, segment_end, character);
|
| -    if (attribute_->IsWordChar(character)) {
|
| -      *word_start = segment_start;
|
| -      break;
|
| -    }
|
| -    segment_start = segment_next;
|
| -  }
|
| -  // Skip trailing non-word characters. The comparison must be strict:
|
| -  // U16_PREV() reads the code unit before |segment_prev|, so running it on
|
| -  // an empty range would read |word_[segment_start - 1]|, which lies outside
|
| -  // the segment (and before the buffer when |segment_start| is 0).
|
| -  while (segment_end > segment_start) {
|
| -    UChar32 character;
|
| -    int segment_prev = segment_end;
|
| -    U16_PREV(word_, segment_start, segment_prev, character);
|
| -    if (attribute_->IsWordChar(character)) {
|
| -      *word_length = segment_end - segment_start;
|
| -      break;
|
| -    }
|
| -    segment_end = segment_prev;
|
| -  }
|
| -}
|
| -
|
| -// Normalizes a non-terminated string into its canonical form so that
|
| -// a spellchecker object can check the spellings of words which contain
|
| -// ligatures, full-width letters, etc.
|
| -// USCRIPT_LATIN does not only consist of US-ASCII and ISO/IEC 8859-1, but
|
| -// also of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin, etc. For
|
| -// details, please read the script table in
|
| -// "http://www.unicode.org/Public/UNIDATA/Scripts.txt".
|
| -// The input is the |input_length| code units of |word_| that start at
|
| -// |input_start|. Returns true only when the normalization status is
|
| -// U_ZERO_ERROR or U_STRING_NOT_TERMINATED_WARNING.
|
| -bool SpellcheckWordIterator::Normalize(int input_start,
|
| -                                       int input_length,
|
| -                                       string16* output_string) const {
|
| -  // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/"
|
| -  // does not only state that NFKD and NFKC can convert ligatures into their
|
| -  // ASCII alternatives, but also states that NFKC keeps the accents of
|
| -  // characters. Therefore, NFKC seems to be the best option for hunspell.
|
| -  icu::UnicodeString source(FALSE, &word_[input_start], input_length);
|
| -  icu::UnicodeString normalized;
|
| -  UErrorCode error = U_ZERO_ERROR;
|
| -  icu::Normalizer::normalize(source, UNORM_NFKC, 0, normalized, error);
|
| -  if (!U_FAILURE(error))
|
| -    output_string->assign(normalized.getTerminatedBuffer());
|
| -
|
| -  if (error == U_ZERO_ERROR)
|
| -    return true;
|
| -  return error == U_STRING_NOT_TERMINATED_WARNING;
|
| -}
|
|
|