OLD | NEW |
(Empty) | |
| 1 // Copyright (C) 2006 Google Inc. |
| 2 // |
| 3 // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 // you may not use this file except in compliance with the License. |
| 5 // You may obtain a copy of the License at |
| 6 // |
| 7 // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 // |
| 9 // Unless required by applicable law or agreed to in writing, software |
| 10 // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 // See the License for the specific language governing permissions and |
| 13 // limitations under the License. |
| 14 |
| 15 // Author: Jim Meehan |
| 16 |
| 17 #ifndef UTIL_UTF8_UNICODETEXT_H__ |
| 18 #define UTIL_UTF8_UNICODETEXT_H__ |
| 19 |
| 20 #include <iterator> |
| 21 #include <string> |
| 22 #include <utility> |
| 23 #include "base/basictypes.h" |
| 24 //#include "util/utf8/public/config.h" |
| 25 |
| 26 using std::string; |
| 27 using std::bidirectional_iterator_tag; |
| 28 using std::pair; |
| 29 |
| 30 // ***************************** UnicodeText ************************** |
| 31 // |
| 32 // A UnicodeText object is a container for a sequence of Unicode |
| 33 // codepoint values. It has a default constructor, a copy constructor, |
| 34 // and an assignment operator. Data can be appended to it from another |
| 35 // UnicodeText, from iterators, or from a single codepoint. |
| 36 // |
| 37 // The internal representation of the text is UTF-8. Since UTF-8 is a |
| 38 // variable-width format, UnicodeText does not provide random access |
| 39 // to the text, and changes to the text are permitted only at the end. |
| 40 // |
| 41 // The UnicodeText class defines a const_iterator. The dereferencing |
| 42 // operator (*) returns a codepoint (char32). The iterator is a |
| 43 // bidirectional, read-only iterator. It becomes invalid if the text |
| 44 // is changed. |
| 45 // |
| 46 // There are methods for appending and retrieving UTF-8 data directly. |
| 47 // The 'utf8_data' method returns a const char* that contains the |
| 48 // UTF-8-encoded version of the text; 'utf8_length' returns the number |
| 49 // of bytes in the UTF-8 data. An iterator's 'get_utf8' method stores up to |
| 50 // 4 bytes of UTF-8 data in a char array and returns the number of |
| 51 // bytes that it stored. |
| 52 // |
| 53 // Codepoints are integers in the range [0, 0xD7FF] or [0xE000, |
| 54 // 0x10FFFF], but UnicodeText has the additional restriction that it |
| 55 // can contain only those characters that are valid for interchange on |
| 56 // the Web. This excludes all of the control codes except for carriage |
| 57 // return, line feed, and horizontal tab. It also excludes |
| 58 // non-characters, but codepoints that are in the Private Use regions |
| 59 // are allowed, as are codepoints that are unassigned. (See the |
| 60 // Unicode reference for details.) The function UniLib::IsInterchangeValid |
| 61 // can be used as a test for this property. |
| 62 // |
| 63 // UnicodeTexts are safe. Every method that constructs or modifies a |
| 64 // UnicodeText tests for interchange-validity, and will substitute a |
| 65 // space for the invalid data. Such cases are reported via |
| 66 // LOG(WARNING). |
| 67 // |
| 68 // MEMORY MANAGEMENT: copy, take ownership, or point to |
| 69 // |
| 70 // A UnicodeText is either an "owner", meaning that it owns the memory |
| 71 // for the data buffer and will free it when the UnicodeText is |
| 72 // destroyed, or it is an "alias", meaning that it does not. |
| 73 // |
| 74 // There are three methods for storing UTF-8 data in a UnicodeText: |
| 75 // |
| 76 // CopyUTF8(buffer, len) copies buffer. |
| 77 // |
| 78 // TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer. |
| 79 // |
| 80 // PointToUTF8(buffer, size) creates an alias pointing to buffer. |
| 81 // |
| 82 // All three methods perform a validity check on the buffer. There are |
| 83 // private, "unsafe" versions of these functions that bypass the |
| 84 // validity check. They are used internally and by friend-functions |
| 85 // that are handling UTF-8 data that has already been validated. |
| 86 // |
| 87 // The purpose of an alias is to avoid making an unnecessary copy of a |
| 88 // UTF-8 buffer while still providing access to the Unicode values |
| 89 // within that text through iterators or the fast scanners that are |
| 90 // based on UTF-8 state tables. The lifetime of an alias must not |
| 91 // exceed the lifetime of the buffer from which it was constructed. |
| 92 // |
| 93 // The semantics of an alias might be described as "copy on write or |
| 94 // repair." The source data is never modified. If push_back() or |
| 95 // append() is called on an alias, a copy of the data will be created, |
| 96 // and the UnicodeText will become an owner. If clear() is called on |
| 97 // an alias, it becomes an (empty) owner. |
| 98 // |
| 99 // The copy constructor and the assignment operator produce an owner. |
| 100 // That is, after direct initialization ("UnicodeText x(y);") or copy |
| 101 // initialization ("UnicodeText x = y;") x will be an owner, even if y |
| 102 // was an alias. The assignment operator ("x = y;") also produces an |
| 103 // owner unless x and y are the same object and y is an alias. |
| 104 // |
| 105 // Aliases should be used with care. If the source from which an alias |
| 106 // was created is freed, or if the contents are changed, while the |
| 107 // alias is still in use, fatal errors could result. But it can be |
| 108 // quite useful to have a UnicodeText "window" through which to see a |
| 109 // UTF-8 buffer without having to pay the price of making a copy. |
| 110 // |
| 111 // UTILITIES |
| 112 // |
| 113 // The interfaces in util/utf8/public/textutils.h provide higher-level |
| 114 // utilities for dealing with UnicodeTexts, including routines for |
| 115 // creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or |
| 116 // strings, creating strings from UnicodeTexts, normalizing text for |
| 117 // efficient matching or display, and others. |
| 118 |
| 119 class UnicodeText { |
| 120 public: |
| 121 class const_iterator; |
| 122 |
| 123 typedef char32 value_type; |
| 124 |
| 125 // Constructors. These always produce owners. |
| 126 UnicodeText(); // Create an empty text. |
| 127 UnicodeText(const UnicodeText& src); // copy constructor |
| 128 // Construct a substring (copies the data). |
| 129 UnicodeText(const const_iterator& first, const const_iterator& last); |
| 130 |
| 131 // Assignment operator. This copies the data and produces an owner |
| 132 // unless this == &src, e.g., "x = x;", which is a no-op. |
| 133 UnicodeText& operator=(const UnicodeText& src); |
| 134 |
| 135 // x.Copy(y) copies the data from y into x. |
| 136 UnicodeText& Copy(const UnicodeText& src); |
| 137 inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); } |
| 138 |
| 139 // x.PointTo(y) changes x so that it points to y's data. |
| 140 // It does not copy y or take ownership of y's data. |
| 141 UnicodeText& PointTo(const UnicodeText& src); |
| 142 UnicodeText& PointTo(const const_iterator& first, |
| 143 const const_iterator& last); |
| 144 |
| 145 ~UnicodeText(); |
| 146 |
| 147 void clear(); // Clear text. |
| 148 bool empty() { return repr_.size_ == 0; } // Test if text is empty. |
| 149 |
| 150 // Add a codepoint to the end of the text. |
| 151 // If the codepoint is not interchange-valid, add a space instead |
| 152 // and log a warning. |
| 153 void push_back(char32 codepoint); |
| 154 |
| 155 // Generic appending operation. |
| 156 // iterator_traits<ForwardIterator>::value_type must be implicitly |
| 157 // convertible to char32. Typical uses of this method might include: |
| 158 // char32 chars[] = {0x1, 0x2, ...}; |
| 159 // vector<char32> more_chars = ...; |
| 160 // utext.append(chars, chars+arraysize(chars)); |
| 161 // utext.append(more_chars.begin(), more_chars.end()); |
| 162 template<typename ForwardIterator> |
| 163 UnicodeText& append(ForwardIterator first, const ForwardIterator last) { |
| 164 while (first != last) { push_back(*first++); } |
| 165 return *this; |
| 166 } |
| 167 |
| 168 // A specialization of the generic append() method. |
| 169 UnicodeText& append(const const_iterator& first, const const_iterator& last); |
| 170 |
| 171 // An optimization of append(source.begin(), source.end()). |
| 172 UnicodeText& append(const UnicodeText& source); |
| 173 |
| 174 int size() const; // the number of Unicode characters (codepoints) |
| 175 |
| 176 friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); |
| 177 friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs); |
| 178 |
| 179 class const_iterator { |
| 180 typedef const_iterator CI; |
| 181 public: |
| 182 typedef bidirectional_iterator_tag iterator_category; |
| 183 typedef char32 value_type; |
| 184 typedef ptrdiff_t difference_type; |
| 185 typedef void pointer; // (Not needed.) |
| 186 typedef const char32 reference; // (Needed for const_reverse_iterator) |
| 187 |
| 188 // Iterators are default-constructible. |
| 189 const_iterator(); |
| 190 |
| 191 // It's safe to make multiple passes over a UnicodeText. |
| 192 const_iterator(const const_iterator& other); |
| 193 const_iterator& operator=(const const_iterator& other); |
| 194 |
| 195 char32 operator*() const; // Dereference |
| 196 |
| 197 const_iterator& operator++(); // Advance (++iter) |
| 198 const_iterator operator++(int) { // (iter++) |
| 199 const_iterator result(*this); |
| 200 ++*this; |
| 201 return result; |
| 202 } |
| 203 |
| 204 const_iterator& operator--(); // Retreat (--iter) |
| 205 const_iterator operator--(int) { // (iter--) |
| 206 const_iterator result(*this); |
| 207 --*this; |
| 208 return result; |
| 209 } |
| 210 |
| 211 // We love relational operators. |
| 212 friend bool operator==(const CI& lhs, const CI& rhs) { |
| 213 return lhs.it_ == rhs.it_; } |
| 214 friend bool operator!=(const CI& lhs, const CI& rhs) { |
| 215 return !(lhs == rhs); } |
| 216 friend bool operator<(const CI& lhs, const CI& rhs); |
| 217 friend bool operator>(const CI& lhs, const CI& rhs) { |
| 218 return rhs < lhs; } |
| 219 friend bool operator<=(const CI& lhs, const CI& rhs) { |
| 220 return !(rhs < lhs); } |
| 221 friend bool operator>=(const CI& lhs, const CI& rhs) { |
| 222 return !(lhs < rhs); } |
| 223 |
| 224 friend difference_type distance(const CI& first, const CI& last); |
| 225 |
| 226 // UTF-8-specific methods |
| 227 // Store the UTF-8 encoding of the current codepoint into buf, |
| 228 // which must be at least 4 bytes long. Return the number of |
| 229 // bytes written. |
| 230 int get_utf8(char* buf) const; |
| 231 // Return the iterator's pointer into the UTF-8 data. |
| 232 const char* utf8_data() const { return it_; } |
| 233 |
| 234 string DebugString() const; |
| 235 |
| 236 private: |
| 237 friend class UnicodeText; |
| 238 friend class UnicodeTextUtils; |
| 239 friend class UTF8StateTableProperty; |
| 240 explicit const_iterator(const char* it) : it_(it) {} |
| 241 |
| 242 const char* it_; |
| 243 }; |
| 244 |
| 245 const_iterator begin() const; |
| 246 const_iterator end() const; |
| 247 |
| 248 class const_reverse_iterator : public std::reverse_iterator<const_iterator> { |
| 249 public: |
| 250 const_reverse_iterator(const_iterator it) : |
| 251 std::reverse_iterator<const_iterator>(it) {} |
| 252 const char* utf8_data() const { |
| 253 const_iterator tmp_it = base(); |
| 254 return (--tmp_it).utf8_data(); |
| 255 } |
| 256 int get_utf8(char* buf) const { |
| 257 const_iterator tmp_it = base(); |
| 258 return (--tmp_it).get_utf8(buf); |
| 259 } |
| 260 }; |
| 261 const_reverse_iterator rbegin() const { |
| 262 return const_reverse_iterator(end()); |
| 263 } |
| 264 const_reverse_iterator rend() const { |
| 265 return const_reverse_iterator(begin()); |
| 266 } |
| 267 |
| 268 // Substring searching. Returns the beginning of the first |
| 269 // occurrence of "look", or end() if not found. |
| 270 const_iterator find(const UnicodeText& look, const_iterator start_pos) const; |
| 271 // Equivalent to find(look, begin()) |
| 272 const_iterator find(const UnicodeText& look) const; |
| 273 |
| 274 // Returns whether this contains the character U+FFFD. This can |
| 275 // occur, for example, if the input to Encodings::Decode() had byte |
| 276 // sequences that were invalid in the source encoding. |
| 277 bool HasReplacementChar() const; |
| 278 |
| 279 // UTF-8-specific methods |
| 280 // |
| 281 // Return the data, length, and capacity of UTF-8-encoded version of |
| 282 // the text. Length and capacity are measured in bytes. |
| 283 const char* utf8_data() const { return repr_.data_; } |
| 284 int utf8_length() const { return repr_.size_; } |
| 285 int utf8_capacity() const { return repr_.capacity_; } |
| 286 |
| 287 // Return the UTF-8 data as a string. |
| 288 static string UTF8Substring(const const_iterator& first, |
| 289 const const_iterator& last); |
| 290 |
| 291 // There are three methods for initializing a UnicodeText from UTF-8 |
| 292 // data. They vary in details of memory management. In all cases, |
| 293 // the data is tested for interchange-validity. If it is not |
| 294 // interchange-valid, a LOG(WARNING) is issued, and each |
| 295 // structurally invalid byte and each interchange-invalid codepoint |
| 296 // is replaced with a space. |
| 297 |
| 298 // x.CopyUTF8(buf, len) copies buf into x. |
| 299 UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length); |
| 300 |
| 301 // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of |
| 302 // buf. buf is not copied. |
| 303 UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer, |
| 304 int byte_length, |
| 305 int byte_capacity); |
| 306 |
| 307 // x.PointToUTF8(buf,len) changes x so that it points to buf |
| 308 // ("becomes an alias"). It does not take ownership or copy buf. |
| 309 // If the buffer is not valid, this has the same effect as |
| 310 // CopyUTF8(utf8_buffer, byte_length). |
| 311 UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length); |
| 312 |
| 313 // Occasionally it is necessary to use functions that operate on the |
| 314 // pointer returned by utf8_data(). MakeIterator(p) provides a way |
| 315 // to get back to the UnicodeText level. It uses CHECK to ensure |
| 316 // that p is a pointer within this object's UTF-8 data, and that it |
| 317 // points to the beginning of a character. |
| 318 const_iterator MakeIterator(const char* p) const; |
| 319 |
| 320 string DebugString() const; |
| 321 |
| 322 private: |
| 323 friend class const_iterator; |
| 324 friend class UnicodeTextUtils; |
| 325 |
| 326 class Repr { // A byte-string. |
| 327 public: |
| 328 char* data_; |
| 329 int size_; |
| 330 int capacity_; |
| 331 bool ours_; // Do we own data_? |
| 332 |
| 333 Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {} |
| 334 ~Repr() { if (ours_) delete[] data_; } |
| 335 |
| 336 void clear(); |
| 337 void reserve(int capacity); |
| 338 void resize(int size); |
| 339 |
| 340 void append(const char* bytes, int byte_length); |
| 341 void Copy(const char* data, int size); |
| 342 void TakeOwnershipOf(char* data, int size, int capacity); |
| 343 void PointTo(const char* data, int size); |
| 344 |
| 345 string DebugString() const; |
| 346 |
| 347 private: |
| 348 Repr& operator=(const Repr&); |
| 349 Repr(const Repr& other); |
| 350 }; |
| 351 |
| 352 Repr repr_; |
| 353 |
| 354 // UTF-8-specific private methods. |
| 355 // These routines do not perform a validity check when compiled |
| 356 // in opt mode. |
| 357 // It is an error to call these methods with UTF-8 data that |
| 358 // is not interchange-valid. |
| 359 // |
| 360 UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length); |
| 361 UnicodeText& UnsafeTakeOwnershipOfUTF8( |
| 362 char* utf8_buffer, int byte_length, int byte_capacity); |
| 363 UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length); |
| 364 UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length); |
| 365 const_iterator UnsafeFind(const UnicodeText& look, |
| 366 const_iterator start_pos) const; |
| 367 }; |
| 368 |
| 369 bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); |
| 370 |
| 371 inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) { |
| 372 return !(lhs == rhs); |
| 373 } |
| 374 |
| 375 // UnicodeTextRange is a pair of iterators, useful for specifying text |
| 376 // segments. If the iterators are ==, the segment is empty. |
| 377 typedef pair<UnicodeText::const_iterator, |
| 378 UnicodeText::const_iterator> UnicodeTextRange; |
| 379 |
| 380 inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) { |
| 381 return r.first == r.second; |
| 382 } |
| 383 |
| 384 |
| 385 // *************************** Utilities ************************* |
| 386 |
| 387 // A factory function for creating a UnicodeText from a buffer of |
| 388 // UTF-8 data. The new UnicodeText takes ownership of the buffer. (It |
| 389 // is an "owner.") |
| 390 // |
| 391 // Each byte that is structurally invalid will be replaced with a |
| 392 // space. Each codepoint that is interchange-invalid will also be |
| 393 // replaced with a space, even if the codepoint was represented with a |
| 394 // multibyte sequence in the UTF-8 data. |
| 395 // |
| 396 inline UnicodeText MakeUnicodeTextAcceptingOwnership( |
| 397 char* utf8_buffer, int byte_length, int byte_capacity) { |
| 398 return UnicodeText().TakeOwnershipOfUTF8( |
| 399 utf8_buffer, byte_length, byte_capacity); |
| 400 } |
| 401 |
| 402 // A factory function for creating a UnicodeText from a buffer of |
| 403 // UTF-8 data. The new UnicodeText does not take ownership of the |
| 404 // buffer. (It is an "alias.") |
| 405 // |
| 406 inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership( |
| 407 const char* utf8_buffer, int byte_length) { |
| 408 return UnicodeText().PointToUTF8(utf8_buffer, byte_length); |
| 409 } |
| 410 |
| 411 // Create a UnicodeText from a UTF-8 string or buffer. |
| 412 // |
| 413 // If do_copy is true, then a copy of the string is made. The copy is |
| 414 // owned by the resulting UnicodeText object and will be freed when |
| 415 // the object is destroyed. This UnicodeText object is referred to |
| 416 // as an "owner." |
| 417 // |
| 418 // If do_copy is false, then no copy is made. The resulting |
| 419 // UnicodeText object does NOT take ownership of the string; in this |
| 420 // case, the lifetime of the UnicodeText object must not exceed the |
| 421 // lifetime of the string. This Unicodetext object is referred to as |
| 422 // an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership. |
| 423 // |
| 424 // If the input string does not contain valid UTF-8, then a copy is |
| 425 // made (as if do_copy were true) and coerced to valid UTF-8 by |
| 426 // replacing each invalid byte with a space. |
| 427 // |
| 428 inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, |
| 429 bool do_copy) { |
| 430 UnicodeText t; |
| 431 if (do_copy) { |
| 432 t.CopyUTF8(utf8_buf, len); |
| 433 } else { |
| 434 t.PointToUTF8(utf8_buf, len); |
| 435 } |
| 436 return t; |
| 437 } |
| 438 |
| 439 inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) { |
| 440 return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy); |
| 441 } |
| 442 |
| 443 inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) { |
| 444 return UTF8ToUnicodeText(utf8_buf, len, true); |
| 445 } |
| 446 inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) { |
| 447 return UTF8ToUnicodeText(utf8_string, true); |
| 448 } |
| 449 |
| 450 // Return a string containing the UTF-8 encoded version of all the |
| 451 // Unicode characters in t. |
| 452 inline string UnicodeTextToUTF8(const UnicodeText& t) { |
| 453 return string(t.utf8_data(), t.utf8_length()); |
| 454 } |
| 455 |
| 456 #endif // UTIL_UTF8_UNICODETEXT_H__ |
OLD | NEW |