OLD | NEW |
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #ifndef BASE_I18N_BREAK_ITERATOR_H_ | 5 #ifndef BASE_I18N_BREAK_ITERATOR_H_ |
6 #define BASE_I18N_BREAK_ITERATOR_H_ | 6 #define BASE_I18N_BREAK_ITERATOR_H_ |
7 #pragma once | 7 #pragma once |
8 | 8 |
9 #include "base/basictypes.h" | 9 #include "base/basictypes.h" |
10 #include "base/string16.h" | 10 #include "base/string16.h" |
(...skipping 23 matching lines...) Expand all Loading... |
34 // (Thai does not use spaces between words). Therefore, this is NOT the same | 34 // (Thai does not use spaces between words). Therefore, this is NOT the same |
35 // as breaking only at space-equivalent characters as its former | 35 // as breaking only at space-equivalent characters as its former |
36 // name (BREAK_SPACE) implied. | 36 // name (BREAK_SPACE) implied. |
37 // | 37 // |
38 // Under BREAK_NEWLINE mode, all characters are included in the returned | 38 // Under BREAK_NEWLINE mode, all characters are included in the returned |
39 // string, breaking only when a newline-equivalent character is encountered | 39 // string, breaking only when a newline-equivalent character is encountered |
40 // (e.g. in the UTF-16 equivalent of the string "foo\nbar!\n\n", the line | 40 // (e.g. in the UTF-16 equivalent of the string "foo\nbar!\n\n", the line |
41 // breaks are at the periods in ".foo\n.bar!\n.\n."). | 41 // breaks are at the periods in ".foo\n.bar!\n.\n."). |
42 // | 42 // |
43 // To extract the words from a string, move a BREAK_WORD BreakIterator | 43 // To extract the words from a string, move a BREAK_WORD BreakIterator |
44 // through the string and test whether IsWord() is true. E.g., | 44 // through the string and test whether IsWord() is true. E.g., |
45 // BreakIterator iter(&str, BreakIterator::BREAK_WORD); | 45 // BreakIterator iter(str, BreakIterator::BREAK_WORD); |
46 // if (!iter.Init()) return false; | 46 // if (!iter.Init()) |
| 47 // return false; |
47 // while (iter.Advance()) { | 48 // while (iter.Advance()) { |
48 // if (iter.IsWord()) { | 49 // if (iter.IsWord()) { |
49 // // region [iter.prev(),iter.pos()) contains a word. | 50 // // Region [iter.prev(), iter.pos()) contains a word. |
50 // VLOG(1) << "word: " << iter.GetString(); | 51 // VLOG(1) << "word: " << iter.GetString(); |
51 // } | 52 // } |
52 // } | 53 // } |
53 | 54 |
54 namespace base { | 55 namespace base { |
55 namespace i18n { | 56 namespace i18n { |
56 | 57 |
57 class BreakIterator { | 58 class BreakIterator { |
58 public: | 59 public: |
59 enum BreakType { | 60 enum BreakType { |
60 BREAK_WORD, | 61 BREAK_WORD, |
61 BREAK_LINE, | 62 BREAK_LINE, |
62 // TODO(jshin): Remove this after reviewing call sites. | 63 // TODO(jshin): Remove this after reviewing call sites. |
63 // If call sites really need break only on space-like characters | 64 // If call sites really need break only on space-like characters |
64 // implement it separately. | 65 // implement it separately. |
65 BREAK_SPACE = BREAK_LINE, | 66 BREAK_SPACE = BREAK_LINE, |
66 BREAK_NEWLINE, | 67 BREAK_NEWLINE, |
67 }; | 68 }; |
68 | 69 |
69 // Requires |str| to live as long as the BreakIterator does. | 70 // Requires |str| to live as long as the BreakIterator does. |
70 BreakIterator(const string16& str, BreakType break_type); | 71 BreakIterator(const string16& str, BreakType break_type); |
71 ~BreakIterator(); | 72 ~BreakIterator(); |
72 | 73 |
73 // Init() must be called before any of the iterators are valid. | 74 // Init() must be called before any of the iterators are valid. |
74 // Returns false if ICU failed to initialize. | 75 // Returns false if ICU failed to initialize. |
75 bool Init(); | 76 bool Init(); |
76 | 77 |
77 // Return the current break position within the string, | |
78 // or BreakIterator::npos when done. | |
79 size_t pos() const { return pos_; } | |
80 | |
81 // Return the value of pos() returned before Advance() was last called. | |
82 size_t prev() const { return prev_; } | |
83 | |
84 // Advance to the next break. Returns false if we've run past the end of | 78 // Advance to the next break. Returns false if we've run past the end of |
85 // the string. (Note that the very last "break" is after the final | 79 // the string. (Note that the very last "break" is after the final |
86 // character in the string, and when we advance to that position it's the | 80 // character in the string, and when we advance to that position it's the |
87 // last time Advance() returns true.) | 81 // last time Advance() returns true.) |
88 bool Advance(); | 82 bool Advance(); |
89 | 83 |
90 // Under BREAK_WORD mode, returns true if the break we just hit is the | 84 // Under BREAK_WORD mode, returns true if the break we just hit is the |
91 // end of a word. (Otherwise, the break iterator just skipped over e.g. | 85 // end of a word. (Otherwise, the break iterator just skipped over e.g. |
92 // whitespace or punctuation.) Under BREAK_LINE and BREAK_NEWLINE modes, | 86 // whitespace or punctuation.) Under BREAK_LINE and BREAK_NEWLINE modes, |
93 // this distinction doesn't apply and it always returns false. | 87 // this distinction doesn't apply and it always returns false. |
94 bool IsWord() const; | 88 bool IsWord() const; |
95 | 89 |
96 // Return the string between prev() and pos(). | 90 // Returns the string between prev() and pos(). |
97 // Advance() must have been called successfully at least once | 91 // Advance() must have been called successfully at least once for pos() to |
98 // for pos() to have advanced to somewhere useful. | 92 // have advanced to somewhere useful. |
99 string16 GetString() const; | 93 string16 GetString() const; |
100 | 94 |
| 95 // Returns the value of pos() returned before Advance() was last called. |
| 96 size_t prev() const { return prev_; } |
| 97 |
| 98 // Returns the current break position within the string, |
| 99 // or BreakIterator::npos when done. |
| 100 size_t pos() const { return pos_; } |
| 101 |
101 private: | 102 private: |
102 // ICU iterator, avoiding ICU ubrk.h dependence. | 103 // ICU iterator, avoiding ICU ubrk.h dependence. |
103 // This is actually an ICU UBreakIterator* type, which turns out to be | 104 // This is actually an ICU UBreakIterator* type, which turns out to be |
104 // a typedef for a void* in the ICU headers. Using void* directly prevents | 105 // a typedef for a void* in the ICU headers. Using void* directly prevents |
105 // callers from needing access to the ICU public headers directory. | 106 // callers from needing access to the ICU public headers directory. |
106 void* iter_; | 107 void* iter_; |
107 | 108 |
108 // The string we're iterating over. | 109 // The string we're iterating over. |
109 const string16& string_; | 110 const string16& string_; |
110 | 111 |
111 // The breaking style (word/space/newline). | 112 // The breaking style (word/space/newline). |
112 BreakType break_type_; | 113 BreakType break_type_; |
113 | 114 |
114 // Previous and current iterator positions. | 115 // Previous and current iterator positions. |
115 size_t prev_, pos_; | 116 size_t prev_, pos_; |
116 | 117 |
117 DISALLOW_COPY_AND_ASSIGN(BreakIterator); | 118 DISALLOW_COPY_AND_ASSIGN(BreakIterator); |
118 }; | 119 }; |
119 | 120 |
120 } // namespace i18n | 121 } // namespace i18n |
121 } // namespace base | 122 } // namespace base |
122 | 123 |
123 #endif // BASE_I18N_BREAK_ITERATOR_H_ | 124 #endif // BASE_I18N_BREAK_ITERATOR_H_ |
OLD | NEW |