Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "base/i18n/break_iterator.h" | 5 #include "base/i18n/break_iterator.h" |
| 6 | 6 |
| 7 #include "base/logging.h" | 7 #include "base/logging.h" |
| 8 #include "third_party/icu/source/common/unicode/ubrk.h" | 8 #include "third_party/icu/source/common/unicode/ubrk.h" |
| 9 #include "third_party/icu/source/common/unicode/uchar.h" | 9 #include "third_party/icu/source/common/unicode/uchar.h" |
| 10 #include "third_party/icu/source/common/unicode/ustring.h" | 10 #include "third_party/icu/source/common/unicode/ustring.h" |
| 11 | 11 |
| 12 namespace base { | 12 namespace base { |
| 13 namespace i18n { | 13 namespace i18n { |
| 14 | 14 |
| 15 const size_t npos = -1; | 15 const size_t npos = -1; |
| 16 | 16 |
| 17 BreakIterator::BreakIterator(const string16& str, BreakType break_type) | 17 BreakIterator::BreakIterator(const string16& str, BreakType break_type) |
| 18 : iter_(NULL), | 18 : iter_(NULL), |
| 19 string_(str), | 19 string_(str), |
| 20 break_type_(break_type), | 20 break_type_(break_type), |
| 21 prev_(npos), | 21 prev_(npos), |
| 22 pos_(0) { | 22 pos_(0) { |
| 23 } | 23 } |
| 24 | 24 |
| 25 BreakIterator::BreakIterator(const string16& str, const string16& rules) | |
| 26 : iter_(NULL), | |
| 27 string_(str), | |
| 28 rules_(rules), | |
| 29 break_type_(RULE_BASED), | |
| 30 prev_(npos), | |
| 31 pos_(0) { | |
| 32 } | |
| 33 | |
| 25 BreakIterator::~BreakIterator() { | 34 BreakIterator::~BreakIterator() { |
| 26 if (iter_) | 35 if (iter_) |
| 27 ubrk_close(static_cast<UBreakIterator*>(iter_)); | 36 ubrk_close(static_cast<UBreakIterator*>(iter_)); |
| 28 } | 37 } |
| 29 | 38 |
| 30 bool BreakIterator::Init() { | 39 bool BreakIterator::Init() { |
| 31 UErrorCode status = U_ZERO_ERROR; | 40 UErrorCode status = U_ZERO_ERROR; |
| 41 UParseError parse_error; | |
| 32 UBreakIteratorType break_type; | 42 UBreakIteratorType break_type; |
| 33 switch (break_type_) { | 43 switch (break_type_) { |
| 34 case BREAK_CHARACTER: | 44 case BREAK_CHARACTER: |
| 35 break_type = UBRK_CHARACTER; | 45 break_type = UBRK_CHARACTER; |
| 36 break; | 46 break; |
| 37 case BREAK_WORD: | 47 case BREAK_WORD: |
| 38 break_type = UBRK_WORD; | 48 break_type = UBRK_WORD; |
| 39 break; | 49 break; |
| 40 case BREAK_LINE: | 50 case BREAK_LINE: |
| 41 case BREAK_NEWLINE: | 51 case BREAK_NEWLINE: |
| 52 case RULE_BASED: // (Keep compiler happy, break_type not used in this case) | |
| 42 break_type = UBRK_LINE; | 53 break_type = UBRK_LINE; |
| 43 break; | 54 break; |
| 44 default: | 55 default: |
| 45 NOTREACHED() << "invalid break_type_"; | 56 NOTREACHED() << "invalid break_type_"; |
| 46 return false; | 57 return false; |
| 47 } | 58 } |
| 48 iter_ = ubrk_open(break_type, NULL, | 59 if (break_type_ == RULE_BASED) { |
| 49 string_.data(), static_cast<int32_t>(string_.size()), | 60 iter_ = ubrk_openRules(rules_.c_str(), // rules string |
| 50 &status); | 61 static_cast<int32_t>(rules_.length()), |
| 62 string_.data(), // text to process | |
| 63 static_cast<int32_t>(string_.size()), | |
| 64 &parse_error, // rule parsing errors out-param | |
| 65 &status); // status code out-param | |
| 66 if (U_FAILURE(status)) { | |
| 67 NOTREACHED() << "ubrk_openRules failed to parse rule string at line " | |
| 68 << parse_error.line << ", offset " << parse_error.offset; | |
| 69 } | |
| 70 } else { | |
| 71 iter_ = ubrk_open(break_type, // break iterator type code | |
| 72 NULL, // locale | |
| 73 string_.data(), // text to process | |
| 74 static_cast<int32_t>(string_.size()), // text length | |
| 75 &status); // status code out-param | |
| 76 if (U_FAILURE(status)) { | |
| 77 NOTREACHED() << "ubrk_open failed"; | |
| 78 } | |
| 79 } | |
| 80 | |
| 51 if (U_FAILURE(status)) { | 81 if (U_FAILURE(status)) { |
| 52 NOTREACHED() << "ubrk_open failed"; | |
| 53 return false; | 82 return false; |
| 54 } | 83 } |
| 84 | |
| 55 // Move the iterator to the beginning of the string. | 85 // Move the iterator to the beginning of the string. |
| 56 ubrk_first(static_cast<UBreakIterator*>(iter_)); | 86 ubrk_first(static_cast<UBreakIterator*>(iter_)); |
| 57 return true; | 87 return true; |
| 58 } | 88 } |
| 59 | 89 |
| 60 bool BreakIterator::Advance() { | 90 bool BreakIterator::Advance() { |
| 61 int32_t pos; | 91 int32_t pos; |
| 62 int32_t status; | 92 int32_t status; |
| 63 prev_ = pos_; | 93 prev_ = pos_; |
| 64 switch (break_type_) { | 94 switch (break_type_) { |
| 65 case BREAK_CHARACTER: | 95 case BREAK_CHARACTER: |
| 66 case BREAK_WORD: | 96 case BREAK_WORD: |
| 67 case BREAK_LINE: | 97 case BREAK_LINE: |
| 98 case RULE_BASED: | |
| 68 pos = ubrk_next(static_cast<UBreakIterator*>(iter_)); | 99 pos = ubrk_next(static_cast<UBreakIterator*>(iter_)); |
| 69 if (pos == UBRK_DONE) { | 100 if (pos == UBRK_DONE) { |
| 70 pos_ = npos; | 101 pos_ = npos; |
| 71 return false; | 102 return false; |
| 72 } | 103 } |
| 73 pos_ = static_cast<size_t>(pos); | 104 pos_ = static_cast<size_t>(pos); |
| 74 return true; | 105 return true; |
| 75 case BREAK_NEWLINE: | 106 case BREAK_NEWLINE: |
| 76 do { | 107 do { |
| 77 pos = ubrk_next(static_cast<UBreakIterator*>(iter_)); | 108 pos = ubrk_next(static_cast<UBreakIterator*>(iter_)); |
| 78 if (pos == UBRK_DONE) | 109 if (pos == UBRK_DONE) |
| 79 break; | 110 break; |
| 80 pos_ = static_cast<size_t>(pos); | 111 pos_ = static_cast<size_t>(pos); |
| 81 status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_)); | 112 status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_)); |
| 82 } while (status >= UBRK_LINE_SOFT && status < UBRK_LINE_SOFT_LIMIT); | 113 } while (status >= UBRK_LINE_SOFT && status < UBRK_LINE_SOFT_LIMIT); |
| 83 if (pos == UBRK_DONE && prev_ == pos_) { | 114 if (pos == UBRK_DONE && prev_ == pos_) { |
| 84 pos_ = npos; | 115 pos_ = npos; |
| 85 return false; | 116 return false; |
| 86 } | 117 } |
| 87 return true; | 118 return true; |
| 88 default: | 119 default: |
| 89 NOTREACHED() << "invalid break_type_"; | 120 NOTREACHED() << "invalid break_type_"; |
| 90 return false; | 121 return false; |
| 91 } | 122 } |
| 92 } | 123 } |
| 93 | 124 |
| 125 bool BreakIterator::SetText(const base::char16* text, const size_t length) { | |
| 126 UErrorCode status = U_ZERO_ERROR; | |
| 127 ubrk_setText(static_cast<UBreakIterator*>(iter_), | |
| 128 text, length, &status); | |
| 129 pos_ = ubrk_first(static_cast<UBreakIterator*>(iter_)); | |
| 130 prev_ = npos; | |
| 131 if (U_FAILURE(status)) { | |
| 132 NOTREACHED() << "ubrk_setText failed"; | |
| 133 return false; | |
| 134 } | |
| 135 return true; | |
| 136 } | |
| 137 | |
| 94 bool BreakIterator::IsWord() const { | 138 bool BreakIterator::IsWord() const { |
| 95 int32_t status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_)); | 139 int32_t status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_)); |
| 96 return (break_type_ == BREAK_WORD && status != UBRK_WORD_NONE); | 140 if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)<br>**groby-ooo-7-16** (2014/05/08 17:52:00): nit: Is there a point in having BreakIterator::IsW…<br>**Andrew Hayden (chromium.org)** (2014/05/09 15:16:16): Maybe. My hope is actually that we can throw some… |
| 141 return false; | |
| 142 return status != UBRK_WORD_NONE; | |
| 97 } | 143 } |
| 98 | 144 |
| 99 bool BreakIterator::IsEndOfWord(size_t position) const { | 145 bool BreakIterator::IsEndOfWord(size_t position) const { |
| 100 if (break_type_ != BREAK_WORD) | 146 if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED) |
| 101 return false; | 147 return false; |
| 102 | 148 |
| 103 UBreakIterator* iter = static_cast<UBreakIterator*>(iter_); | 149 UBreakIterator* iter = static_cast<UBreakIterator*>(iter_); |
| 104 UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position)); | 150 UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position)); |
| 105 int32_t status = ubrk_getRuleStatus(iter); | 151 int32_t status = ubrk_getRuleStatus(iter); |
| 106 return (!!boundary && status != UBRK_WORD_NONE); | 152 return (!!boundary && status != UBRK_WORD_NONE); |
| 107 } | 153 } |
| 108 | 154 |
| 109 bool BreakIterator::IsStartOfWord(size_t position) const { | 155 bool BreakIterator::IsStartOfWord(size_t position) const { |
| 110 if (break_type_ != BREAK_WORD) | 156 if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED) |
| 111 return false; | 157 return false; |
| 112 | 158 |
| 113 UBreakIterator* iter = static_cast<UBreakIterator*>(iter_); | 159 UBreakIterator* iter = static_cast<UBreakIterator*>(iter_); |
| 114 UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position)); | 160 UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position)); |
| 115 ubrk_next(iter); | 161 ubrk_next(iter); |
| 116 int32_t next_status = ubrk_getRuleStatus(iter); | 162 int32_t next_status = ubrk_getRuleStatus(iter); |
| 117 return (!!boundary && next_status != UBRK_WORD_NONE); | 163 return (!!boundary && next_status != UBRK_WORD_NONE); |
| 118 } | 164 } |
| 119 | 165 |
| 120 string16 BreakIterator::GetString() const { | 166 string16 BreakIterator::GetString() const { |
| 121 DCHECK(prev_ != npos && pos_ != npos); | 167 DCHECK(prev_ != npos && pos_ != npos); |
| 122 return string_.substr(prev_, pos_ - prev_); | 168 return string_.substr(prev_, pos_ - prev_); |
| 123 } | 169 } |
| 124 | 170 |
| 125 } // namespace i18n | 171 } // namespace i18n |
| 126 } // namespace base | 172 } // namespace base |
| OLD | NEW |