OLD | NEW |
| (Empty) |
1 // Copyright (C) 2006 Google Inc. | |
2 // | |
3 // Licensed under the Apache License, Version 2.0 (the "License"); | |
4 // you may not use this file except in compliance with the License. | |
5 // You may obtain a copy of the License at | |
6 // | |
7 // http://www.apache.org/licenses/LICENSE-2.0 | |
8 // | |
9 // Unless required by applicable law or agreed to in writing, software | |
10 // distributed under the License is distributed on an "AS IS" BASIS, | |
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
12 // See the License for the specific language governing permissions and | |
13 // limitations under the License. | |
14 | |
15 // Author: Jim Meehan | |
16 | |
17 #ifndef UTIL_UTF8_UNICODETEXT_H__ | |
18 #define UTIL_UTF8_UNICODETEXT_H__ | |
19 | |
20 #include <iterator> | |
21 #include <string> | |
22 #include <utility> | |
23 #include "base/basictypes.h" | |
24 //#include "util/utf8/public/config.h" | |
25 | |
26 using std::string; | |
27 using std::bidirectional_iterator_tag; | |
28 using std::pair; | |
29 | |
30 // ***************************** UnicodeText ************************** | |
31 // | |
32 // A UnicodeText object is a container for a sequence of Unicode | |
33 // codepoint values. It is default-constructible, copyable, and assignable. | |
34 // Data can be appended to it from another UnicodeText, from | |
35 // iterators, or from a single codepoint. | |
36 // | |
37 // The internal representation of the text is UTF-8. Since UTF-8 is a | |
38 // variable-width format, UnicodeText does not provide random access | |
39 // to the text, and changes to the text are permitted only at the end. | |
40 // | |
41 // The UnicodeText class defines a const_iterator. The dereferencing | |
42 // operator (*) returns a codepoint (char32). The iterator is a | |
43 // bidirectional, read-only iterator. It becomes invalid if the text | |
44 // is changed. | |
45 // | |
46 // There are methods for appending and retrieving UTF-8 data directly. | |
47 // The 'utf8_data' method returns a const char* that contains the | |
48 // UTF-8-encoded version of the text; 'utf8_length' returns the number | |
49 // of bytes in the UTF-8 data. An iterator's 'get_utf8' method stores up to | |
50 // 4 bytes of UTF-8 data in a char array and returns the number of | |
51 // bytes that it stored. | |
52 // | |
53 // Codepoints are integers in the range [0, 0xD7FF] or [0xE000, | |
54 // 0x10FFFF], but UnicodeText has the additional restriction that it | |
55 // can contain only those characters that are valid for interchange on | |
56 // the Web. This excludes all of the control codes except for carriage | |
57 // return, line feed, and horizontal tab. It also excludes | |
58 // non-characters, but codepoints that are in the Private Use regions | |
59 // are allowed, as are codepoints that are unassigned. (See the | |
60 // Unicode reference for details.) The function UniLib::IsInterchangeValid | |
61 // can be used as a test for this property. | |
62 // | |
63 // UnicodeTexts are safe. Every method that constructs or modifies a | |
64 // UnicodeText tests for interchange-validity, and will substitute a | |
65 // space for the invalid data. Such cases are reported via | |
66 // LOG(WARNING). | |
67 // | |
68 // MEMORY MANAGEMENT: copy, take ownership, or point to | |
69 // | |
70 // A UnicodeText is either an "owner", meaning that it owns the memory | |
71 // for the data buffer and will free it when the UnicodeText is | |
72 // destroyed, or it is an "alias", meaning that it does not. | |
73 // | |
74 // There are three methods for storing UTF-8 data in a UnicodeText: | |
75 // | |
76 // CopyUTF8(buffer, len) copies buffer. | |
77 // | |
78 // TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer. | |
79 // | |
80 // PointToUTF8(buffer, size) creates an alias pointing to buffer. | |
81 // | |
82 // All three methods perform a validity check on the buffer. There are | |
83 // private, "unsafe" versions of these functions that bypass the | |
84 // validity check. They are used internally and by friend-functions | |
85 // that are handling UTF-8 data that has already been validated. | |
86 // | |
87 // The purpose of an alias is to avoid making an unnecessary copy of a | |
88 // UTF-8 buffer while still providing access to the Unicode values | |
89 // within that text through iterators or the fast scanners that are | |
90 // based on UTF-8 state tables. The lifetime of an alias must not | |
91 // exceed the lifetime of the buffer from which it was constructed. | |
92 // | |
93 // The semantics of an alias might be described as "copy on write or | |
94 // repair." The source data is never modified. If push_back() or | |
95 // append() is called on an alias, a copy of the data will be created, | |
96 // and the UnicodeText will become an owner. If clear() is called on | |
97 // an alias, it becomes an (empty) owner. | |
98 // | |
99 // The copy constructor and the assignment operator produce an owner. | |
100 // That is, after direct initialization ("UnicodeText x(y);") or copy | |
101 // initialization ("UnicodeText x = y;") x will be an owner, even if y | |
102 // was an alias. The assignment operator ("x = y;") also produces an | |
103 // owner unless x and y are the same object and y is an alias. | |
104 // | |
105 // Aliases should be used with care. If the source from which an alias | |
106 // was created is freed, or if the contents are changed, while the | |
107 // alias is still in use, fatal errors could result. But it can be | |
108 // quite useful to have a UnicodeText "window" through which to see a | |
109 // UTF-8 buffer without having to pay the price of making a copy. | |
110 // | |
111 // UTILITIES | |
112 // | |
113 // The interfaces in util/utf8/public/textutils.h provide higher-level | |
114 // utilities for dealing with UnicodeTexts, including routines for | |
115 // creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or | |
116 // strings, creating strings from UnicodeTexts, normalizing text for | |
117 // efficient matching or display, and others. | |
118 | |
119 class UnicodeText { | |
120 public: | |
121 class const_iterator; | |
122 | |
123 typedef char32 value_type; | |
124 | |
125 // Constructors. These always produce owners. | |
126 UnicodeText(); // Create an empty text. | |
127 UnicodeText(const UnicodeText& src); // copy constructor | |
128 // Construct a substring (copies the data). | |
129 UnicodeText(const const_iterator& first, const const_iterator& last); | |
130 | |
131 // Assignment operator. This copies the data and produces an owner | |
132 // unless this == &src, e.g., "x = x;", which is a no-op. | |
133 UnicodeText& operator=(const UnicodeText& src); | |
134 | |
135 // x.Copy(y) copies the data from y into x. | |
136 UnicodeText& Copy(const UnicodeText& src); | |
137 inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); } | |
138 | |
139 // x.PointTo(y) changes x so that it points to y's data. | |
140 // It does not copy y or take ownership of y's data. | |
141 UnicodeText& PointTo(const UnicodeText& src); | |
142 UnicodeText& PointTo(const const_iterator& first, | |
143 const const_iterator& last); | |
144 | |
145 ~UnicodeText(); | |
146 | |
147 void clear(); // Clear text. | |
148 bool empty() { return repr_.size_ == 0; } // Test if text is empty. | |
149 | |
150 // Add a codepoint to the end of the text. | |
151 // If the codepoint is not interchange-valid, add a space instead | |
152 // and log a warning. | |
153 void push_back(char32 codepoint); | |
154 | |
155 // Generic appending operation. | |
156 // iterator_traits<ForwardIterator>::value_type must be implicitly | |
157 // convertible to char32. Typical uses of this method might include: | |
158 // char32 chars[] = {0x1, 0x2, ...}; | |
159 // vector<char32> more_chars = ...; | |
160 // utext.append(chars, chars+arraysize(chars)); | |
161 // utext.append(more_chars.begin(), more_chars.end()); | |
162 template<typename ForwardIterator> | |
163 UnicodeText& append(ForwardIterator first, const ForwardIterator last) { | |
164 while (first != last) { push_back(*first++); } | |
165 return *this; | |
166 } | |
167 | |
168 // A specialization of the generic append() method. | |
169 UnicodeText& append(const const_iterator& first, const const_iterator& last); | |
170 | |
171 // An optimization of append(source.begin(), source.end()). | |
172 UnicodeText& append(const UnicodeText& source); | |
173 | |
174 int size() const; // the number of Unicode characters (codepoints) | |
175 | |
176 friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); | |
177 friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs); | |
178 | |
179 class const_iterator { | |
180 typedef const_iterator CI; | |
181 public: | |
182 typedef bidirectional_iterator_tag iterator_category; | |
183 typedef char32 value_type; | |
184 typedef ptrdiff_t difference_type; | |
185 typedef void pointer; // (Not needed.) | |
186 typedef const char32 reference; // (Needed for const_reverse_iterator) | |
187 | |
188 // Iterators are default-constructible. | |
189 const_iterator(); | |
190 | |
191 // It's safe to make multiple passes over a UnicodeText. | |
192 const_iterator(const const_iterator& other); | |
193 const_iterator& operator=(const const_iterator& other); | |
194 | |
195 char32 operator*() const; // Dereference | |
196 | |
197 const_iterator& operator++(); // Advance (++iter) | |
198 const_iterator operator++(int) { // (iter++) | |
199 const_iterator result(*this); | |
200 ++*this; | |
201 return result; | |
202 } | |
203 | |
204 const_iterator& operator--(); // Retreat (--iter) | |
205 const_iterator operator--(int) { // (iter--) | |
206 const_iterator result(*this); | |
207 --*this; | |
208 return result; | |
209 } | |
210 | |
211 // We love relational operators. | |
212 friend bool operator==(const CI& lhs, const CI& rhs) { | |
213 return lhs.it_ == rhs.it_; } | |
214 friend bool operator!=(const CI& lhs, const CI& rhs) { | |
215 return !(lhs == rhs); } | |
216 friend bool operator<(const CI& lhs, const CI& rhs); | |
217 friend bool operator>(const CI& lhs, const CI& rhs) { | |
218 return rhs < lhs; } | |
219 friend bool operator<=(const CI& lhs, const CI& rhs) { | |
220 return !(rhs < lhs); } | |
221 friend bool operator>=(const CI& lhs, const CI& rhs) { | |
222 return !(lhs < rhs); } | |
223 | |
224 friend difference_type distance(const CI& first, const CI& last); | |
225 | |
226 // UTF-8-specific methods | |
227 // Store the UTF-8 encoding of the current codepoint into buf, | |
228 // which must be at least 4 bytes long. Return the number of | |
229 // bytes written. | |
230 int get_utf8(char* buf) const; | |
231 // Return the iterator's pointer into the UTF-8 data. | |
232 const char* utf8_data() const { return it_; } | |
233 | |
234 string DebugString() const; | |
235 | |
236 private: | |
237 friend class UnicodeText; | |
238 friend class UnicodeTextUtils; | |
239 friend class UTF8StateTableProperty; | |
240 explicit const_iterator(const char* it) : it_(it) {} | |
241 | |
242 const char* it_; | |
243 }; | |
244 | |
245 const_iterator begin() const; | |
246 const_iterator end() const; | |
247 | |
248 class const_reverse_iterator : public std::reverse_iterator<const_iterator> { | |
249 public: | |
250 const_reverse_iterator(const_iterator it) : | |
251 std::reverse_iterator<const_iterator>(it) {} | |
252 const char* utf8_data() const { | |
253 const_iterator tmp_it = base(); | |
254 return (--tmp_it).utf8_data(); | |
255 } | |
256 int get_utf8(char* buf) const { | |
257 const_iterator tmp_it = base(); | |
258 return (--tmp_it).get_utf8(buf); | |
259 } | |
260 }; | |
261 const_reverse_iterator rbegin() const { | |
262 return const_reverse_iterator(end()); | |
263 } | |
264 const_reverse_iterator rend() const { | |
265 return const_reverse_iterator(begin()); | |
266 } | |
267 | |
268 // Substring searching. Returns the beginning of the first | |
269 // occurrence of "look", or end() if not found. | |
270 const_iterator find(const UnicodeText& look, const_iterator start_pos) const; | |
271 // Equivalent to find(look, begin()) | |
272 const_iterator find(const UnicodeText& look) const; | |
273 | |
274 // Returns whether this contains the character U+FFFD. This can | |
275 // occur, for example, if the input to Encodings::Decode() had byte | |
276 // sequences that were invalid in the source encoding. | |
277 bool HasReplacementChar() const; | |
278 | |
279 // UTF-8-specific methods | |
280 // | |
281 // Return the data, length, and capacity of UTF-8-encoded version of | |
282 // the text. Length and capacity are measured in bytes. | |
283 const char* utf8_data() const { return repr_.data_; } | |
284 int utf8_length() const { return repr_.size_; } | |
285 int utf8_capacity() const { return repr_.capacity_; } | |
286 | |
287 // Return the UTF-8 data as a string. | |
288 static string UTF8Substring(const const_iterator& first, | |
289 const const_iterator& last); | |
290 | |
291 // There are three methods for initializing a UnicodeText from UTF-8 | |
292 // data. They vary in details of memory management. In all cases, | |
293 // the data is tested for interchange-validity. If it is not | |
294 // interchange-valid, a LOG(WARNING) is issued, and each | |
295 // structurally invalid byte and each interchange-invalid codepoint | |
296 // is replaced with a space. | |
297 | |
298 // x.CopyUTF8(buf, len) copies buf into x. | |
299 UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length); | |
300 | |
301 // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of | |
302 // buf. buf is not copied. | |
303 UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer, | |
304 int byte_length, | |
305 int byte_capacity); | |
306 | |
307 // x.PointToUTF8(buf,len) changes x so that it points to buf | |
308 // ("becomes an alias"). It does not take ownership or copy buf. | |
309 // If the buffer is not valid, this has the same effect as | |
310 // CopyUTF8(utf8_buffer, byte_length). | |
311 UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length); | |
312 | |
313 // Occasionally it is necessary to use functions that operate on the | |
314 // pointer returned by utf8_data(). MakeIterator(p) provides a way | |
315 // to get back to the UnicodeText level. It uses CHECK to ensure | |
316 // that p is a pointer within this object's UTF-8 data, and that it | |
317 // points to the beginning of a character. | |
318 const_iterator MakeIterator(const char* p) const; | |
319 | |
320 string DebugString() const; | |
321 | |
322 private: | |
323 friend class const_iterator; | |
324 friend class UnicodeTextUtils; | |
325 | |
326 class Repr { // A byte-string. | |
327 public: | |
328 char* data_; | |
329 int size_; | |
330 int capacity_; | |
331 bool ours_; // Do we own data_? | |
332 | |
333 Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {} | |
334 ~Repr() { if (ours_) delete[] data_; } | |
335 | |
336 void clear(); | |
337 void reserve(int capacity); | |
338 void resize(int size); | |
339 | |
340 void append(const char* bytes, int byte_length); | |
341 void Copy(const char* data, int size); | |
342 void TakeOwnershipOf(char* data, int size, int capacity); | |
343 void PointTo(const char* data, int size); | |
344 | |
345 string DebugString() const; | |
346 | |
347 private: | |
348 Repr& operator=(const Repr&); | |
349 Repr(const Repr& other); | |
350 }; | |
351 | |
352 Repr repr_; | |
353 | |
354 // UTF-8-specific private methods. | |
355 // These routines do not perform a validity check when compiled | |
356 // in opt mode. | |
357 // It is an error to call these methods with UTF-8 data that | |
358 // is not interchange-valid. | |
359 // | |
360 UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length); | |
361 UnicodeText& UnsafeTakeOwnershipOfUTF8( | |
362 char* utf8_buffer, int byte_length, int byte_capacity); | |
363 UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length); | |
364 UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length); | |
365 const_iterator UnsafeFind(const UnicodeText& look, | |
366 const_iterator start_pos) const; | |
367 }; | |
368 | |
369 bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); | |
370 | |
371 inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) { | |
372 return !(lhs == rhs); | |
373 } | |
374 | |
375 // UnicodeTextRange is a pair of iterators, useful for specifying text | |
376 // segments. If the iterators are ==, the segment is empty. | |
377 typedef pair<UnicodeText::const_iterator, | |
378 UnicodeText::const_iterator> UnicodeTextRange; | |
379 | |
380 inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) { | |
381 return r.first == r.second; | |
382 } | |
383 | |
384 | |
385 // *************************** Utilities ************************* | |
386 | |
387 // A factory function for creating a UnicodeText from a buffer of | |
388 // UTF-8 data. The new UnicodeText takes ownership of the buffer. (It | |
389 // is an "owner.") | |
390 // | |
391 // Each byte that is structurally invalid will be replaced with a | |
392 // space. Each codepoint that is interchange-invalid will also be | |
393 // replaced with a space, even if the codepoint was represented with a | |
394 // multibyte sequence in the UTF-8 data. | |
395 // | |
396 inline UnicodeText MakeUnicodeTextAcceptingOwnership( | |
397 char* utf8_buffer, int byte_length, int byte_capacity) { | |
398 return UnicodeText().TakeOwnershipOfUTF8( | |
399 utf8_buffer, byte_length, byte_capacity); | |
400 } | |
401 | |
402 // A factory function for creating a UnicodeText from a buffer of | |
403 // UTF-8 data. The new UnicodeText does not take ownership of the | |
404 // buffer. (It is an "alias.") | |
405 // | |
406 inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership( | |
407 const char* utf8_buffer, int byte_length) { | |
408 return UnicodeText().PointToUTF8(utf8_buffer, byte_length); | |
409 } | |
410 | |
411 // Create a UnicodeText from a UTF-8 string or buffer. | |
412 // | |
413 // If do_copy is true, then a copy of the string is made. The copy is | |
414 // owned by the resulting UnicodeText object and will be freed when | |
415 // the object is destroyed. This UnicodeText object is referred to | |
416 // as an "owner." | |
417 // | |
418 // If do_copy is false, then no copy is made. The resulting | |
419 // UnicodeText object does NOT take ownership of the string; in this | |
420 // case, the lifetime of the UnicodeText object must not exceed the | |
421 // lifetime of the string. This Unicodetext object is referred to as | |
422 // an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership. | |
423 // | |
424 // If the input string does not contain valid UTF-8, then a copy is | |
425 // made (as if do_copy were true) and coerced to valid UTF-8 by | |
426 // replacing each invalid byte with a space. | |
427 // | |
428 inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, | |
429 bool do_copy) { | |
430 UnicodeText t; | |
431 if (do_copy) { | |
432 t.CopyUTF8(utf8_buf, len); | |
433 } else { | |
434 t.PointToUTF8(utf8_buf, len); | |
435 } | |
436 return t; | |
437 } | |
438 | |
439 inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) { | |
440 return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy); | |
441 } | |
442 | |
443 inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) { | |
444 return UTF8ToUnicodeText(utf8_buf, len, true); | |
445 } | |
446 inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) { | |
447 return UTF8ToUnicodeText(utf8_string, true); | |
448 } | |
449 | |
450 // Return a string containing the UTF-8 encoded version of all the | |
451 // Unicode characters in t. | |
452 inline string UnicodeTextToUTF8(const UnicodeText& t) { | |
453 return string(t.utf8_data(), t.utf8_length()); | |
454 } | |
455 | |
456 #endif // UTIL_UTF8_UNICODETEXT_H__ | |
OLD | NEW |