OLD | NEW |
(Empty) | |
| 1 // Copyright (C) 2006 Google Inc. |
| 2 // |
| 3 // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 // you may not use this file except in compliance with the License. |
| 5 // You may obtain a copy of the License at |
| 6 // |
| 7 // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 // |
| 9 // Unless required by applicable law or agreed to in writing, software |
| 10 // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 // See the License for the specific language governing permissions and |
| 13 // limitations under the License. |
| 14 |
| 15 // Author: Jim Meehan |
| 16 |
#include <algorithm>
#include <cassert>
#include <cstring>
#include <iostream>
#include <sstream>

#include "utf/unicodetext.h"
//#include "base/logging.h"
#include "utf/stringpiece.h"
//#include "utf/stringprintf.h"
#include "utf/utf.h"
#include "utf/unilib.h"
| 27 |
| 28 using std::stringstream; |
| 29 using std::max; |
| 30 using std::hex; |
| 31 using std::dec; |
| 32 using std::cerr; |
| 33 using std::endl; |
| 34 |
// Returns the number of code points in the UTF-8 byte range [start, end).
// Every byte that is not a trail byte (10xxxxxx) begins a code point; viewed
// as a signed char, the non-trail bytes are exactly those >= -0x40.
static int CodepointDistance(const char* start, const char* end) {
  const signed char* cursor = reinterpret_cast<const signed char*>(start);
  const signed char* const limit = reinterpret_cast<const signed char*>(end);
  int count = 0;
  while (cursor < limit) {
    if (*cursor >= -0x40) {
      ++count;
    }
    ++cursor;
  }
  return count;
}
| 43 |
| 44 static int CodepointCount(const char* utf8, int len) { |
| 45 return CodepointDistance(utf8, utf8 + len); |
| 46 } |
| 47 |
| 48 UnicodeText::const_iterator::difference_type |
| 49 distance(const UnicodeText::const_iterator& first, |
| 50 const UnicodeText::const_iterator& last) { |
| 51 return CodepointDistance(first.it_, last.it_); |
| 52 } |
| 53 |
| 54 // ---------- Utility ---------- |
| 55 |
| 56 static int ConvertToInterchangeValid(char* start, int len) { |
| 57 // This routine is called only when we've discovered that a UTF-8 buffer |
| 58 // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8 |
| 59 // was not interchange valid. This indicates a bug in the caller, and |
| 60 // a LOG(WARNING) is done in that case. |
| 61 // This is similar to CoerceToInterchangeValid, but it replaces each |
| 62 // structurally valid byte with a space, and each non-interchange |
| 63 // character with a space, even when that character requires more |
| 64 // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is |
| 65 // structurally valid UTF8, but U+FDD0 is not an interchange-valid |
| 66 // code point. The result should contain one space, not three. |
| 67 // |
| 68 // Since the conversion never needs to write more data than it |
| 69 // reads, it is safe to change the buffer in place. It returns the |
| 70 // number of bytes written. |
| 71 char* const in = start; |
| 72 char* out = start; |
| 73 char* const end = start + len; |
| 74 while (start < end) { |
| 75 int good = UniLib::SpanInterchangeValid(start, end - start); |
| 76 if (good > 0) { |
| 77 if (out != start) { |
| 78 memmove(out, start, good); |
| 79 } |
| 80 out += good; |
| 81 start += good; |
| 82 if (start == end) { |
| 83 break; |
| 84 } |
| 85 } |
| 86 // Is the current string invalid UTF8 or just non-interchange UTF8? |
| 87 char32 rune; |
| 88 int n; |
| 89 if (isvalidcharntorune(start, end - start, &rune, &n)) { |
| 90 // structurally valid UTF8, but not interchange valid |
| 91 start += n; // Skip over the whole character. |
| 92 } else { // bad UTF8 |
| 93 start += 1; // Skip over just one byte |
| 94 } |
| 95 *out++ = ' '; |
| 96 } |
| 97 return out - in; |
| 98 } |
| 99 |
| 100 |
| 101 // *************** Data representation ********** |
| 102 |
| 103 // Note: the copy constructor is undefined. |
| 104 |
| 105 // After reserve(), resize(), or clear(), we're an owner, not an alias. |
| 106 |
| 107 void UnicodeText::Repr::reserve(int new_capacity) { |
| 108 // If there's already enough capacity, and we're an owner, do nothing. |
| 109 if (capacity_ >= new_capacity && ours_) return; |
| 110 |
| 111 // Otherwise, allocate a new buffer. |
| 112 capacity_ = max(new_capacity, (3 * capacity_) / 2 + 20); |
| 113 char* new_data = new char[capacity_]; |
| 114 |
| 115 // If there is an old buffer, copy it into the new buffer. |
| 116 if (data_) { |
| 117 memcpy(new_data, data_, size_); |
| 118 if (ours_) delete[] data_; // If we owned the old buffer, free it. |
| 119 } |
| 120 data_ = new_data; |
| 121 ours_ = true; // We own the new buffer. |
| 122 // size_ is unchanged. |
| 123 } |
| 124 |
| 125 void UnicodeText::Repr::resize(int new_size) { |
| 126 if (new_size == 0) { |
| 127 clear(); |
| 128 } else { |
| 129 if (!ours_ || new_size > capacity_) reserve(new_size); |
| 130 // Clear the memory in the expanded part. |
| 131 if (size_ < new_size) memset(data_ + size_, 0, new_size - size_); |
| 132 size_ = new_size; |
| 133 ours_ = true; |
| 134 } |
| 135 } |
| 136 |
| 137 // This implementation of clear() deallocates the buffer if we're an owner. |
| 138 // That's not strictly necessary; we could just set size_ to 0. |
| 139 void UnicodeText::Repr::clear() { |
| 140 if (ours_) delete[] data_; |
| 141 data_ = NULL; |
| 142 size_ = capacity_ = 0; |
| 143 ours_ = true; |
| 144 } |
| 145 |
| 146 void UnicodeText::Repr::Copy(const char* data, int size) { |
| 147 resize(size); |
| 148 memcpy(data_, data, size); |
| 149 } |
| 150 |
| 151 void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) { |
| 152 if (data == data_) return; // We already own this memory. (Weird case.) |
| 153 if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. |
| 154 data_ = data; |
| 155 size_ = size; |
| 156 capacity_ = capacity; |
| 157 ours_ = true; |
| 158 } |
| 159 |
| 160 void UnicodeText::Repr::PointTo(const char* data, int size) { |
| 161 if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. |
| 162 data_ = const_cast<char*>(data); |
| 163 size_ = size; |
| 164 capacity_ = size; |
| 165 ours_ = false; |
| 166 } |
| 167 |
| 168 void UnicodeText::Repr::append(const char* bytes, int byte_length) { |
| 169 reserve(size_ + byte_length); |
| 170 memcpy(data_ + size_, bytes, byte_length); |
| 171 size_ += byte_length; |
| 172 } |
| 173 |
| 174 string UnicodeText::Repr::DebugString() const { |
| 175 stringstream ss; |
| 176 |
| 177 ss << "{Repr " << hex << this << " data=" << data_ << " size=" << dec |
| 178 << size_ << " capacity=" << capacity_ << " " |
| 179 << (ours_ ? "Owned" : "Alias") << "}"; |
| 180 |
| 181 string result; |
| 182 ss >> result; |
| 183 |
| 184 return result; |
| 185 } |
| 186 |
| 187 |
| 188 |
| 189 // *************** UnicodeText ****************** |
| 190 |
| 191 // ----- Constructors ----- |
| 192 |
| 193 // Default constructor |
| 194 UnicodeText::UnicodeText() { |
| 195 } |
| 196 |
| 197 // Copy constructor |
| 198 UnicodeText::UnicodeText(const UnicodeText& src) { |
| 199 Copy(src); |
| 200 } |
| 201 |
| 202 // Substring constructor |
| 203 UnicodeText::UnicodeText(const UnicodeText::const_iterator& first, |
| 204 const UnicodeText::const_iterator& last) { |
| 205 assert(first <= last && "Incompatible iterators"); |
| 206 repr_.append(first.it_, last.it_ - first.it_); |
| 207 } |
| 208 |
| 209 string UnicodeText::UTF8Substring(const const_iterator& first, |
| 210 const const_iterator& last) { |
| 211 assert(first <= last && "Incompatible iterators"); |
| 212 return string(first.it_, last.it_ - first.it_); |
| 213 } |
| 214 |
| 215 |
| 216 // ----- Copy ----- |
| 217 |
| 218 UnicodeText& UnicodeText::operator=(const UnicodeText& src) { |
| 219 if (this != &src) { |
| 220 Copy(src); |
| 221 } |
| 222 return *this; |
| 223 } |
| 224 |
| 225 UnicodeText& UnicodeText::Copy(const UnicodeText& src) { |
| 226 repr_.Copy(src.repr_.data_, src.repr_.size_); |
| 227 return *this; |
| 228 } |
| 229 |
| 230 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) { |
| 231 repr_.Copy(buffer, byte_length); |
| 232 if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { |
| 233 cerr << "UTF-8 buffer is not interchange-valid." << endl; |
| 234 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); |
| 235 } |
| 236 return *this; |
| 237 } |
| 238 |
| 239 UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer, |
| 240 int byte_length) { |
| 241 repr_.Copy(buffer, byte_length); |
| 242 return *this; |
| 243 } |
| 244 |
| 245 // ----- TakeOwnershipOf ----- |
| 246 |
| 247 UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer, |
| 248 int byte_length, |
| 249 int byte_capacity) { |
| 250 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); |
| 251 if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { |
| 252 cerr << "UTF-8 buffer is not interchange-valid." << endl; |
| 253 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); |
| 254 } |
| 255 return *this; |
| 256 } |
| 257 |
| 258 UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer, |
| 259 int byte_length, |
| 260 int byte_capacity) { |
| 261 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); |
| 262 return *this; |
| 263 } |
| 264 |
| 265 // ----- PointTo ----- |
| 266 |
| 267 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) { |
| 268 if (UniLib:: IsInterchangeValid(buffer, byte_length)) { |
| 269 repr_.PointTo(buffer, byte_length); |
| 270 } else { |
| 271 cerr << "UTF-8 buffer is not interchange-valid." << endl; |
| 272 repr_.Copy(buffer, byte_length); |
| 273 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); |
| 274 } |
| 275 return *this; |
| 276 } |
| 277 |
| 278 UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer, |
| 279 int byte_length) { |
| 280 repr_.PointTo(buffer, byte_length); |
| 281 return *this; |
| 282 } |
| 283 |
| 284 UnicodeText& UnicodeText::PointTo(const UnicodeText& src) { |
| 285 repr_.PointTo(src.repr_.data_, src.repr_.size_); |
| 286 return *this; |
| 287 } |
| 288 |
| 289 UnicodeText& UnicodeText::PointTo(const const_iterator &first, |
| 290 const const_iterator &last) { |
| 291 assert(first <= last && " Incompatible iterators"); |
| 292 repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data()); |
| 293 return *this; |
| 294 } |
| 295 |
| 296 // ----- Append ----- |
| 297 |
| 298 UnicodeText& UnicodeText::append(const UnicodeText& u) { |
| 299 repr_.append(u.repr_.data_, u.repr_.size_); |
| 300 return *this; |
| 301 } |
| 302 |
| 303 UnicodeText& UnicodeText::append(const const_iterator& first, |
| 304 const const_iterator& last) { |
| 305 assert(first <= last && "Incompatible iterators"); |
| 306 repr_.append(first.it_, last.it_ - first.it_); |
| 307 return *this; |
| 308 } |
| 309 |
| 310 UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) { |
| 311 repr_.append(utf8, len); |
| 312 return *this; |
| 313 } |
| 314 |
| 315 // ----- substring searching ----- |
| 316 |
| 317 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look, |
| 318 const_iterator start_pos) const { |
| 319 assert(start_pos.utf8_data() >= utf8_data()); |
| 320 assert(start_pos.utf8_data() <= utf8_data() + utf8_length()); |
| 321 return UnsafeFind(look, start_pos); |
| 322 } |
| 323 |
| 324 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const { |
| 325 return UnsafeFind(look, begin()); |
| 326 } |
| 327 |
| 328 UnicodeText::const_iterator UnicodeText::UnsafeFind( |
| 329 const UnicodeText& look, const_iterator start_pos) const { |
| 330 // Due to the magic of the UTF8 encoding, searching for a sequence of |
| 331 // letters is equivalent to substring search. |
| 332 StringPiece searching(utf8_data(), utf8_length()); |
| 333 StringPiece look_piece(look.utf8_data(), look.utf8_length()); |
| 334 StringPiece::size_type found = |
| 335 searching.find(look_piece, start_pos.utf8_data() - utf8_data()); |
| 336 if (found == StringPiece::npos) return end(); |
| 337 return const_iterator(utf8_data() + found); |
| 338 } |
| 339 |
| 340 bool UnicodeText::HasReplacementChar() const { |
| 341 // Equivalent to: |
| 342 // UnicodeText replacement_char; |
| 343 // replacement_char.push_back(0xFFFD); |
| 344 // return find(replacement_char) != end(); |
| 345 StringPiece searching(utf8_data(), utf8_length()); |
| 346 StringPiece looking_for("\xEF\xBF\xBD", 3); |
| 347 return searching.find(looking_for) != StringPiece::npos; |
| 348 } |
| 349 |
| 350 // ----- other methods ----- |
| 351 |
| 352 // Clear operator |
| 353 void UnicodeText::clear() { |
| 354 repr_.clear(); |
| 355 } |
| 356 |
| 357 // Destructor |
| 358 UnicodeText::~UnicodeText() {} |
| 359 |
| 360 |
| 361 void UnicodeText::push_back(char32 c) { |
| 362 if (UniLib::IsValidCodepoint(c)) { |
| 363 char buf[UTFmax]; |
| 364 int len = runetochar(buf, &c); |
| 365 if (UniLib::IsInterchangeValid(buf, len)) { |
| 366 repr_.append(buf, len); |
| 367 } else { |
| 368 cerr << "Unicode value 0x" << hex << c |
| 369 << " is not valid for interchange" << endl; |
| 370 repr_.append(" ", 1); |
| 371 } |
| 372 } else { |
| 373 cerr << "Illegal Unicode value: 0x" << hex << c << endl; |
| 374 repr_.append(" ", 1); |
| 375 } |
| 376 } |
| 377 |
| 378 int UnicodeText::size() const { |
| 379 return CodepointCount(repr_.data_, repr_.size_); |
| 380 } |
| 381 |
| 382 bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) { |
| 383 if (&lhs == &rhs) return true; |
| 384 if (lhs.repr_.size_ != rhs.repr_.size_) return false; |
| 385 return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0; |
| 386 } |
| 387 |
| 388 string UnicodeText::DebugString() const { |
| 389 stringstream ss; |
| 390 |
| 391 ss << "{UnicodeText " << hex << this << dec << " chars=" |
| 392 << size() << " repr=" << repr_.DebugString() << "}"; |
| 393 #if 0 |
| 394 return StringPrintf("{UnicodeText %p chars=%d repr=%s}", |
| 395 this, |
| 396 size(), |
| 397 repr_.DebugString().c_str()); |
| 398 #endif |
| 399 string result; |
| 400 ss >> result; |
| 401 |
| 402 return result; |
| 403 } |
| 404 |
| 405 |
| 406 // ******************* UnicodeText::const_iterator ********************* |
| 407 |
| 408 // The implementation of const_iterator would be nicer if it |
| 409 // inherited from boost::iterator_facade |
| 410 // (http://boost.org/libs/iterator/doc/iterator_facade.html). |
| 411 |
| 412 UnicodeText::const_iterator::const_iterator() : it_(0) {} |
| 413 |
| 414 UnicodeText::const_iterator::const_iterator(const const_iterator& other) |
| 415 : it_(other.it_) { |
| 416 } |
| 417 |
| 418 UnicodeText::const_iterator& |
| 419 UnicodeText::const_iterator::operator=(const const_iterator& other) { |
| 420 if (&other != this) |
| 421 it_ = other.it_; |
| 422 return *this; |
| 423 } |
| 424 |
| 425 UnicodeText::const_iterator UnicodeText::begin() const { |
| 426 return const_iterator(repr_.data_); |
| 427 } |
| 428 |
| 429 UnicodeText::const_iterator UnicodeText::end() const { |
| 430 return const_iterator(repr_.data_ + repr_.size_); |
| 431 } |
| 432 |
| 433 bool operator<(const UnicodeText::const_iterator& lhs, |
| 434 const UnicodeText::const_iterator& rhs) { |
| 435 return lhs.it_ < rhs.it_; |
| 436 } |
| 437 |
| 438 char32 UnicodeText::const_iterator::operator*() const { |
| 439 // (We could call chartorune here, but that does some |
| 440 // error-checking, and we're guaranteed that our data is valid |
| 441 // UTF-8. Also, we expect this routine to be called very often. So |
| 442 // for speed, we do the calculation ourselves.) |
| 443 |
| 444 // Convert from UTF-8 |
| 445 int byte1 = it_[0]; |
| 446 if (byte1 < 0x80) |
| 447 return byte1; |
| 448 |
| 449 int byte2 = it_[1]; |
| 450 if (byte1 < 0xE0) |
| 451 return ((byte1 & 0x1F) << 6) |
| 452 | (byte2 & 0x3F); |
| 453 |
| 454 int byte3 = it_[2]; |
| 455 if (byte1 < 0xF0) |
| 456 return ((byte1 & 0x0F) << 12) |
| 457 | ((byte2 & 0x3F) << 6) |
| 458 | (byte3 & 0x3F); |
| 459 |
| 460 int byte4 = it_[3]; |
| 461 return ((byte1 & 0x07) << 18) |
| 462 | ((byte2 & 0x3F) << 12) |
| 463 | ((byte3 & 0x3F) << 6) |
| 464 | (byte4 & 0x3F); |
| 465 } |
| 466 |
| 467 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() { |
| 468 it_ += UniLib::OneCharLen(it_); |
| 469 return *this; |
| 470 } |
| 471 |
| 472 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() { |
| 473 while (UniLib::IsTrailByte(*--it_)); |
| 474 return *this; |
| 475 } |
| 476 |
| 477 int UnicodeText::const_iterator::get_utf8(char* utf8_output) const { |
| 478 utf8_output[0] = it_[0]; |
| 479 if (static_cast<unsigned char>(it_[0]) < 0x80) |
| 480 return 1; |
| 481 |
| 482 utf8_output[1] = it_[1]; |
| 483 if (static_cast<unsigned char>(it_[0]) < 0xE0) |
| 484 return 2; |
| 485 |
| 486 utf8_output[2] = it_[2]; |
| 487 if (static_cast<unsigned char>(it_[0]) < 0xF0) |
| 488 return 3; |
| 489 |
| 490 utf8_output[3] = it_[3]; |
| 491 return 4; |
| 492 } |
| 493 |
| 494 |
| 495 UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const { |
| 496 assert(p != NULL); |
| 497 const char* start = utf8_data(); |
| 498 int len = utf8_length(); |
| 499 const char* end = start + len; |
| 500 assert(p >= start); |
| 501 assert(p <= end); |
| 502 assert(p == end || !UniLib::IsTrailByte(*p)); |
| 503 return const_iterator(p); |
| 504 } |
| 505 |
| 506 string UnicodeText::const_iterator::DebugString() const { |
| 507 stringstream ss; |
| 508 |
| 509 ss << "{iter " << hex << it_ << "}"; |
| 510 string result; |
| 511 ss >> result; |
| 512 |
| 513 return result; |
| 514 } |
| 515 |
OLD | NEW |