OLD | NEW |
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #ifndef BASE_I18N_BREAK_ITERATOR_H_ | 5 #ifndef BASE_I18N_BREAK_ITERATOR_H_ |
6 #define BASE_I18N_BREAK_ITERATOR_H_ | 6 #define BASE_I18N_BREAK_ITERATOR_H_ |
7 #pragma once | 7 #pragma once |
8 | 8 |
9 #include "base/basictypes.h" | 9 #include "base/basictypes.h" |
10 #include "base/string16.h" | 10 #include "base/string16.h" |
11 | 11 |
12 // The BreakIterator class iterates through the words, word breaks, and | 12 // The BreakIterator class iterates through the words, word breaks, and |
13 // line breaks in a UTF-16 string. | 13 // line breaks in a UTF-16 string. |
14 // | 14 // |
15 // It provides several modes, BREAK_WORD, BREAK_SPACE, and BREAK_NEWLINE, | 15 // It provides several modes, BREAK_WORD, BREAK_LINE, and BREAK_NEWLINE, |
16 // which modify how characters are aggregated into the returned string. | 16 // which modify how characters are aggregated into the returned string. |
17 // | 17 // |
18 // Under BREAK_WORD mode, once a word is encountered any non-word | 18 // Under BREAK_WORD mode, once a word is encountered any non-word |
19 // characters are not included in the returned string (e.g. in the | 19 // characters are not included in the returned string (e.g. in the |
20 // UTF-16 equivalent of the string " foo bar! ", the word breaks are at | 20 // UTF-16 equivalent of the string " foo bar! ", the word breaks are at |
21 // the periods in ". .foo. .bar.!. ."). | 21 // the periods in ". .foo. .bar.!. ."). |
| 22 // Note that Chinese/Japanese/Thai do not use spaces between words, so word |
| 23 // boundaries can fall in the middle of a continuous run of non-space / |
| 24 // non-punctuation characters. |
22 // | 25 // |
23 // Under BREAK_SPACE mode, once a word is encountered, any non-word | 26 // Under BREAK_LINE mode, once a line breaking opportunity is encountered, |
24 // characters are included in the returned string, breaking only when a | 27 // any non-word characters are included in the returned string, breaking |
25 // space-equivalent character is encountered (e.g. in the | 28 // only when a space-equivalent character or a line breaking opportunity |
26 // UTF16-equivalent of the string " foo bar! ", the word breaks are at | 29 // is encountered (e.g. in the UTF16-equivalent of the string " foo bar! ", |
27 // the periods in ". .foo .bar! ."). | 30 // the breaks are at the periods in ". .foo .bar! ."). |
| 31 // |
| 32 // Note that lines can be broken at any character/syllable/grapheme cluster |
| 33 // boundary in Chinese/Japanese/Korean and at word boundaries in Thai |
| 34 // (Thai does not use spaces between words). Therefore, this is NOT the same |
| 35 // as breaking only at space-equivalent characters, which is what its |
| 36 // former name (BREAK_SPACE) implied. |
28 // | 37 // |
29 // Under BREAK_NEWLINE mode, all characters are included in the returned | 38 // Under BREAK_NEWLINE mode, all characters are included in the returned |
30 // string, breaking only when a newline-equivalent character is encountered | 39 // string, breaking only when a newline-equivalent character is encountered |
31 // (e.g. in the UTF-16 equivalent of the string "foo\nbar!\n\n", the line | 40 // (e.g. in the UTF-16 equivalent of the string "foo\nbar!\n\n", the line |
32 // breaks are at the periods in ".foo\n.bar!\n.\n."). | 41 // breaks are at the periods in ".foo\n.bar!\n.\n."). |
33 // | 42 // |
34 // To extract the words from a string, move a BREAK_WORD BreakIterator | 43 // To extract the words from a string, move a BREAK_WORD BreakIterator |
35 // through the string and test whether IsWord() is true. E.g., | 44 // through the string and test whether IsWord() is true. E.g., |
36 // BreakIterator iter(&str, BreakIterator::BREAK_WORD); | 45 // BreakIterator iter(&str, BreakIterator::BREAK_WORD); |
37 // if (!iter.Init()) return false; | 46 // if (!iter.Init()) return false; |
38 // while (iter.Advance()) { | 47 // while (iter.Advance()) { |
39 // if (iter.IsWord()) { | 48 // if (iter.IsWord()) { |
40 // // region [iter.prev(),iter.pos()) contains a word. | 49 // // region [iter.prev(),iter.pos()) contains a word. |
41 // VLOG(1) << "word: " << iter.GetString(); | 50 // VLOG(1) << "word: " << iter.GetString(); |
42 // } | 51 // } |
43 // } | 52 // } |
44 | 53 |
45 namespace base { | 54 namespace base { |
46 | 55 |
47 class BreakIterator { | 56 class BreakIterator { |
48 public: | 57 public: |
49 enum BreakType { | 58 enum BreakType { |
50 BREAK_WORD, | 59 BREAK_WORD, |
51 BREAK_SPACE, | 60 BREAK_LINE, |
| 61 // TODO(jshin): Remove this after reviewing call sites. |
| 62 // If call sites really need break only on space-like characters |
| 63 // implement it separately. |
| 64 BREAK_SPACE = BREAK_LINE, |
52 BREAK_NEWLINE, | 65 BREAK_NEWLINE, |
53 }; | 66 }; |
54 | 67 |
55 // Requires |str| to live as long as the BreakIterator does. | 68 // Requires |str| to live as long as the BreakIterator does. |
56 BreakIterator(const string16* str, BreakType break_type); | 69 BreakIterator(const string16* str, BreakType break_type); |
57 ~BreakIterator(); | 70 ~BreakIterator(); |
58 | 71 |
59 // Init() must be called before any of the iterators are valid. | 72 // Init() must be called before any of the iterators are valid. |
60 // Returns false if ICU failed to initialize. | 73 // Returns false if ICU failed to initialize. |
61 bool Init(); | 74 bool Init(); |
62 | 75 |
63 // Return the current break position within the string, | 76 // Return the current break position within the string, |
64 // or BreakIterator::npos when done. | 77 // or BreakIterator::npos when done. |
65 size_t pos() const { return pos_; } | 78 size_t pos() const { return pos_; } |
66 | 79 |
67 // Return the value of pos() returned before Advance() was last called. | 80 // Return the value of pos() returned before Advance() was last called. |
68 size_t prev() const { return prev_; } | 81 size_t prev() const { return prev_; } |
69 | 82 |
70 // Advance to the next break. Returns false if we've run past the end of | 83 // Advance to the next break. Returns false if we've run past the end of |
71 // the string. (Note that the very last "break" is after the final | 84 // the string. (Note that the very last "break" is after the final |
72 // character in the string, and when we advance to that position it's the | 85 // character in the string, and when we advance to that position it's the |
73 // last time Advance() returns true.) | 86 // last time Advance() returns true.) |
74 bool Advance(); | 87 bool Advance(); |
75 | 88 |
76 // Under BREAK_WORD mode, returns true if the break we just hit is the | 89 // Under BREAK_WORD mode, returns true if the break we just hit is the |
77 // end of a word. (Otherwise, the break iterator just skipped over e.g. | 90 // end of a word. (Otherwise, the break iterator just skipped over e.g. |
78 // whitespace or punctuation.) Under BREAK_SPACE and BREAK_NEWLINE modes, | 91 // whitespace or punctuation.) Under BREAK_LINE and BREAK_NEWLINE modes, |
79 // this distinction doesn't apply and it always returns false. | 92 // this distinction doesn't apply and it always returns false. |
80 bool IsWord() const; | 93 bool IsWord() const; |
81 | 94 |
82 // Return the string between prev() and pos(). | 95 // Return the string between prev() and pos(). |
83 // Advance() must have been called successfully at least once | 96 // Advance() must have been called successfully at least once |
84 // for pos() to have advanced to somewhere useful. | 97 // for pos() to have advanced to somewhere useful. |
85 string16 GetString() const; | 98 string16 GetString() const; |
86 | 99 |
87 private: | 100 private: |
88 // ICU iterator, avoiding ICU ubrk.h dependence. | 101 // ICU iterator, avoiding ICU ubrk.h dependence. |
(...skipping 10 matching lines...) Expand all Loading... |
99 | 112 |
100 // Previous and current iterator positions. | 113 // Previous and current iterator positions. |
101 size_t prev_, pos_; | 114 size_t prev_, pos_; |
102 | 115 |
103 DISALLOW_COPY_AND_ASSIGN(BreakIterator); | 116 DISALLOW_COPY_AND_ASSIGN(BreakIterator); |
104 }; | 117 }; |
105 | 118 |
106 } // namespace base | 119 } // namespace base |
107 | 120 |
108 #endif // BASE_I18N_BREAK_ITERATOR_H_ | 121 #endif // BASE_I18N_BREAK_ITERATOR_H_ |
OLD | NEW |