| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #ifndef BASE_I18N_BREAK_ITERATOR_H_ | |
| 6 #define BASE_I18N_BREAK_ITERATOR_H_ | |
| 7 | |
| 8 #include "base/basictypes.h" | |
| 9 #include "base/i18n/base_i18n_export.h" | |
| 10 #include "base/strings/string16.h" | |
| 11 #include "base/strings/string_piece.h" | |
| 12 | |
| 13 // The BreakIterator class iterates through the words, word breaks, and | |
| 14 // line breaks in a UTF-16 string. | |
| 15 // | |
| 16 // It provides several modes, BREAK_WORD, BREAK_LINE, and BREAK_NEWLINE, | |
| 17 // which modify how characters are aggregated into the returned string. | |
| 18 // | |
| 19 // Under BREAK_WORD mode, once a word is encountered any non-word | |
| 20 // characters are not included in the returned string (e.g. in the | |
| 21 // UTF-16 equivalent of the string " foo bar! ", the word breaks are at | |
| 22 // the periods in ". .foo. .bar.!. ."). | |
| 23 // Note that Chinese/Japanese/Thai do not use spaces between words so that | |
| 24 // boundaries can fall in the middle of a continuous run of non-space / | |
| 25 // non-punctuation characters. | |
| 26 // | |
| 27 // Under BREAK_LINE mode, once a line breaking opportunity is encountered, | |
| 28 // any non-word characters are included in the returned string, breaking | |
| 29 // only when a space-equivalent character or a line breaking opportunity | |
| 30 // is encountered (e.g. in the UTF-16 equivalent of the string " foo bar! ", | |
| 31 // the breaks are at the periods in ". .foo .bar! ."). | |
| 32 // | |
| 33 // Note that lines can be broken at any character/syllable/grapheme cluster | |
| 34 // boundary in Chinese/Japanese/Korean and at word boundaries in Thai | |
| 35 // (Thai does not use spaces between words). Therefore, this is NOT the same | |
| 36 // as breaking only at space-equivalent characters, which is what its former | |
| 37 // name (BREAK_SPACE) implied. | |
| 38 // | |
| 39 // Under BREAK_NEWLINE mode, all characters are included in the returned | |
| 40 // string, breaking only when a newline-equivalent character is encountered | |
| 41 // (e.g. in the UTF-16 equivalent of the string "foo\nbar!\n\n", the line | |
| 42 // breaks are at the periods in ".foo\n.bar!\n.\n."). | |
| 43 // | |
| 44 // To extract the words from a string, move a BREAK_WORD BreakIterator | |
| 45 // through the string and test whether IsWord() is true. E.g., | |
| 46 // BreakIterator iter(str, BreakIterator::BREAK_WORD); | |
| 47 // if (!iter.Init()) | |
| 48 // return false; | |
| 49 // while (iter.Advance()) { | |
| 50 // if (iter.IsWord()) { | |
| 51 // // Region [iter.prev(), iter.pos()) contains a word. | |
| 52 // VLOG(1) << "word: " << iter.GetString(); | |
| 53 // } | |
| 54 // } | |
| 55 | |
| 56 namespace base { | |
| 57 namespace i18n { | |
| 58 | |
| 59 class BASE_I18N_EXPORT BreakIterator { | |
| 60 public: | |
| 61 enum BreakType { | |
| 62 BREAK_WORD, | |
| 63 BREAK_LINE, | |
| 64 // TODO(jshin): Remove BREAK_SPACE after reviewing call sites. | |
| 65 // If call sites really need to break only on space-like characters, | |
| 66 // implement that separately. | |
| 67 BREAK_SPACE = BREAK_LINE, | |
| 68 BREAK_NEWLINE, | |
| 69 BREAK_CHARACTER, | |
| 70 // But don't remove this one! | |
| 71 RULE_BASED, | |
| 72 }; | |
| 73 | |
| 74 // Requires |str| to live as long as the BreakIterator does. | |
| 75 BreakIterator(const StringPiece16& str, BreakType break_type); | |
| 76 // Make a rule-based iterator. BreakType == RULE_BASED is implied. | |
| 77 // TODO(andrewhayden): This signature could easily be misinterpreted as | |
| 78 // "(const string16& str, const string16& locale)". We should do something | |
| 79 // better. | |
| 80 BreakIterator(const StringPiece16& str, const string16& rules); | |
| 81 ~BreakIterator(); | |
| 82 | |
| 83 // Init() must be called before any of the iterators are valid. | |
| 84 // Returns false if ICU failed to initialize. | |
| 85 bool Init(); | |
| 86 | |
| 87 // Advance to the next break. Returns false if we've run past the end of | |
| 88 // the string. (Note that the very last "break" is after the final | |
| 89 // character in the string, and when we advance to that position it's the | |
| 90 // last time Advance() returns true.) | |
| 91 bool Advance(); | |
| 92 | |
| 93 // Updates the text used by the iterator, resetting the iterator as if | |
| 94 // Init() had been called again. Any old state is lost. Returns true | |
| 95 // unless there is an error setting the text. | |
| 96 bool SetText(const base::char16* text, const size_t length); | |
| 97 | |
| 98 // Under BREAK_WORD mode, returns true if the break we just hit is the | |
| 99 // end of a word. (Otherwise, the break iterator just skipped over e.g. | |
| 100 // whitespace or punctuation.) Under BREAK_LINE and BREAK_NEWLINE modes, | |
| 101 // this distinction doesn't apply and it always returns false. | |
| 102 bool IsWord() const; | |
| 103 | |
| 104 // Under BREAK_WORD mode, return true if |position| is at the end of a word | |
| 105 // or at the start of a word, respectively. Both always return false under | |
| 106 // BREAK_LINE and BREAK_NEWLINE modes. | |
| 107 bool IsEndOfWord(size_t position) const; | |
| 108 bool IsStartOfWord(size_t position) const; | |
| 109 | |
| 110 // Under BREAK_CHARACTER mode, returns whether |position| is a Unicode | |
| 111 // grapheme boundary. | |
| 112 bool IsGraphemeBoundary(size_t position) const; | |
| 113 | |
| 114 // Returns the string between prev() and pos(). | |
| 115 // Advance() must have been called successfully at least once for pos() to | |
| 116 // have advanced to somewhere useful. | |
| 117 string16 GetString() const; | |
| 118 | |
| 119 StringPiece16 GetStringPiece() const; | |
| 120 | |
| 121 // Returns the value of pos() returned before Advance() was last called. | |
| 122 size_t prev() const { return prev_; } | |
| 123 | |
| 124 // Returns the current break position within the string, | |
| 125 // or BreakIterator::npos when done. | |
| 126 size_t pos() const { return pos_; } | |
| 127 | |
| 128 private: | |
| 129 // ICU iterator, avoiding ICU ubrk.h dependence. | |
| 130 // This is actually an ICU UBreakIterator* type, which turns out to be | |
| 131 // a typedef for a void* in the ICU headers. Using void* directly prevents | |
| 132 // callers from needing access to the ICU public headers directory. | |
| 133 void* iter_; | |
| 134 | |
| 135 // The string we're iterating over. Can be changed with SetText(...). | |
| 136 StringPiece16 string_; | |
| 137 | |
| 138 // Rules for our iterator. Mutually exclusive with break_type_. | |
| 139 const string16 rules_; | |
| 140 | |
| 141 // The breaking style (see BreakType). Mutually exclusive with rules_. | |
| 142 BreakType break_type_; | |
| 143 | |
| 144 // Previous and current iterator positions. | |
| 145 size_t prev_, pos_; | |
| 146 | |
| 147 DISALLOW_COPY_AND_ASSIGN(BreakIterator); | |
| 148 }; | |
| 149 | |
| 150 } // namespace i18n | |
| 151 } // namespace base | |
| 152 | |
| 153 #endif // BASE_I18N_BREAK_ITERATOR_H_ | |
| OLD | NEW |