Index: third_party/libphonenumber/cpp/src/utf/unicodetext.cc |
=================================================================== |
--- third_party/libphonenumber/cpp/src/utf/unicodetext.cc (revision 0) |
+++ third_party/libphonenumber/cpp/src/utf/unicodetext.cc (revision 0) |
@@ -0,0 +1,515 @@ |
+// Copyright (C) 2006 Google Inc. |
+// |
+// Licensed under the Apache License, Version 2.0 (the "License"); |
+// you may not use this file except in compliance with the License. |
+// You may obtain a copy of the License at |
+// |
+// http://www.apache.org/licenses/LICENSE-2.0 |
+// |
+// Unless required by applicable law or agreed to in writing, software |
+// distributed under the License is distributed on an "AS IS" BASIS, |
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
+// See the License for the specific language governing permissions and |
+// limitations under the License. |
+ |
+// Author: Jim Meehan |
+ |
+#include <iostream> |
+#include <sstream> |
+#include <cassert> |
+ |
+#include "utf/unicodetext.h" |
+//#include "base/logging.h" |
+#include "utf/stringpiece.h" |
+//#include "utf/stringprintf.h" |
+#include "utf/utf.h" |
+#include "utf/unilib.h" |
+ |
+using std::stringstream; |
+using std::max; |
+using std::hex; |
+using std::dec; |
+using std::cerr; |
+using std::endl; |
+ |
+// Counts the code points encoded in the UTF-8 bytes [start, end). |
+// A code point begins at every byte that is not a continuation (trail) |
+// byte, so counting those bytes counts the code points. |
+static int CodepointDistance(const char* start, const char* end) { |
+  int count = 0; |
+  const char* cursor = start; |
+  while (cursor < end) { |
+    // Read as signed, trail bytes 0x80..0xBF are -0x80..-0x41; every other |
+    // byte value is >= -0x40. |
+    const signed char byte = *reinterpret_cast<const signed char*>(cursor); |
+    if (byte >= -0x40) { |
+      ++count; |
+    } |
+    ++cursor; |
+  } |
+  return count; |
+} |
+ |
+// Returns the number of code points in the first len bytes at utf8. |
+static int CodepointCount(const char* utf8, int len) { |
+  const char* const limit = utf8 + len; |
+  return CodepointDistance(utf8, limit); |
+} |
+ |
+// Number of code points between two iterators, found by scanning the |
+// UTF-8 bytes from first's position up to (not including) last's. |
+UnicodeText::const_iterator::difference_type |
+distance(const UnicodeText::const_iterator& first, |
+         const UnicodeText::const_iterator& last) { |
+  const char* const from = first.it_; |
+  const char* const to = last.it_; |
+  return CodepointDistance(from, to); |
+} |
+ |
+// ---------- Utility ---------- |
+ |
+// Rewrites the len bytes at start in place, replacing every byte or |
+// character that is not interchange-valid UTF-8 with a single space. |
+// Returns the number of bytes in the rewritten buffer (never more than len). |
+static int ConvertToInterchangeValid(char* start, int len) { |
+ // This routine is called only when we've discovered that a UTF-8 buffer |
+ // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8 |
+ // was not interchange valid. This indicates a bug in the caller, and |
+ // a warning is written to cerr by those callers in that case. |
+ // This is similar to CoerceToInterchangeValid, but it replaces each |
+ // structurally invalid byte with a space, and each non-interchange |
+ // character with a space, even when that character requires more |
+ // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is |
+ // structurally valid UTF8, but U+FDD0 is not an interchange-valid |
+ // code point. The result should contain one space, not three. |
+ // |
+ // Since the conversion never needs to write more data than it |
+ // reads, it is safe to change the buffer in place. It returns the |
+ // number of bytes written. |
+ // |
+ // in   - fixed start of the buffer, used only to compute the result length. |
+ // out  - write cursor; it never gets ahead of the read cursor (start). |
+ // end  - one past the last input byte. |
+ char* const in = start; |
+ char* out = start; |
+ char* const end = start + len; |
+ while (start < end) { |
+ // good is treated as the length of the acceptable run at the read cursor. |
+ int good = UniLib::SpanInterchangeValid(start, end - start); |
+ if (good > 0) { |
+ // Once a replacement has shortened the output, later good runs must be |
+ // shifted down; memmove is used because the two ranges can overlap. |
+ if (out != start) { |
+ memmove(out, start, good); |
+ } |
+ out += good; |
+ start += good; |
+ if (start == end) { |
+ break; |
+ } |
+ } |
+ // Is the current string invalid UTF8 or just non-interchange UTF8? |
+ // NOTE(review): isvalidcharntorune is assumed to report the character's |
+ // byte length through n when it returns true -- confirm in utf/utf.h. |
+ char32 rune; |
+ int n; |
+ if (isvalidcharntorune(start, end - start, &rune, &n)) { |
+ // structurally valid UTF8, but not interchange valid |
+ start += n; // Skip over the whole character. |
+ } else { // bad UTF8 |
+ start += 1; // Skip over just one byte |
+ } |
+ // Whatever was skipped above collapses to exactly one space. |
+ *out++ = ' '; |
+ } |
+ return out - in; |
+} |
+ |
+ |
+// *************** Data representation ********** |
+ |
+// Note: the copy constructor is undefined. |
+ |
+// After reserve(), resize(), or clear(), we're an owner, not an alias. |
+ |
+// Makes sure we own a buffer that can hold at least new_capacity bytes. |
+// The current contents (size_ bytes) are carried over; size_ is unchanged. |
+void UnicodeText::Repr::reserve(int new_capacity) { |
+  // Nothing to do when we already own a large enough buffer. |
+  const bool owned_and_big_enough = ours_ && capacity_ >= new_capacity; |
+  if (owned_and_big_enough) return; |
+ |
+  // Grow to at least 1.5x the old capacity plus 20 bytes. |
+  capacity_ = max(new_capacity, (3 * capacity_) / 2 + 20); |
+  char* const fresh = new char[capacity_]; |
+ |
+  // Carry over the old bytes, releasing the old buffer only if it was ours. |
+  if (data_ != NULL) { |
+    memcpy(fresh, data_, size_); |
+    if (ours_) { |
+      delete[] data_; |
+    } |
+  } |
+ |
+  data_ = fresh; |
+  ours_ = true;  // From here on the buffer belongs to us. |
+} |
+ |
+// Sets the logical size to new_size bytes and makes us the owner. |
+// Bytes gained by growing are zero-filled; a size of 0 goes through clear(). |
+void UnicodeText::Repr::resize(int new_size) { |
+  if (new_size == 0) { |
+    clear(); |
+    return; |
+  } |
+ |
+  // An aliased buffer, or one that is too small, is replaced by reserve(). |
+  const bool needs_own_buffer = !ours_ || new_size > capacity_; |
+  if (needs_own_buffer) reserve(new_size); |
+ |
+  // Zero the newly exposed tail when growing. |
+  if (new_size > size_) { |
+    memset(data_ + size_, 0, new_size - size_); |
+  } |
+  size_ = new_size; |
+  ours_ = true; |
+} |
+ |
+// Resets to the empty state. An owned buffer is deallocated here; that is |
+// not strictly required (setting size_ to 0 would do), but it is what this |
+// implementation chooses. An aliased buffer is simply dropped. |
+void UnicodeText::Repr::clear() { |
+  if (ours_) { |
+    delete[] data_; |
+  } |
+  data_ = NULL; |
+  capacity_ = 0; |
+  size_ = 0; |
+  ours_ = true; |
+} |
+ |
+// Replaces the contents with a private copy of the size bytes at data. |
+// resize() first makes us the owner of a buffer holding exactly size bytes |
+// (or of no buffer at all when size is 0); the bytes are then copied in. |
+void UnicodeText::Repr::Copy(const char* data, int size) { |
+  resize(size); |
+  // resize(0) leaves data_ NULL, and data may itself be NULL for an empty |
+  // source. memcpy must not be given a null pointer even for zero bytes. |
+  if (size > 0) memcpy(data_, data, size); |
+} |
+ |
+// Adopts the caller's buffer (data, size, capacity) as our own storage. |
+// Any buffer we previously owned is released first. |
+void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) { |
+  // Being handed the buffer we already hold is a no-op. (Weird case.) |
+  if (data_ == data) { |
+    return; |
+  } |
+ |
+  // Free the previous buffer only if it was ours. |
+  if (ours_ && data_) { |
+    delete[] data_; |
+  } |
+ |
+  data_ = data; |
+  size_ = size; |
+  capacity_ = capacity; |
+  ours_ = true; |
+} |
+ |
+// Makes this Repr an alias of the caller's size bytes at data, without |
+// copying and without taking ownership. A previously owned buffer is freed. |
+void UnicodeText::Repr::PointTo(const char* data, int size) { |
+  if (ours_ && data_) { |
+    delete[] data_; |
+  } |
+ |
+  // The const is cast away for storage only; ours_ == false marks the |
+  // buffer as not ours. |
+  data_ = const_cast<char*>(data); |
+  capacity_ = size; |
+  size_ = size; |
+  ours_ = false; |
+} |
+ |
+// Appends byte_length bytes starting at bytes to the end of the buffer, |
+// growing it (and becoming its owner) through reserve(). |
+// |
+// bytes may point into our own buffer, e.g. when a text is appended to |
+// itself. reserve() can free that buffer, so in that case the source is |
+// re-derived from the new buffer instead of reading freed memory. |
+void UnicodeText::Repr::append(const char* bytes, int byte_length) { |
+  // Only an owned buffer is freed by reserve(); an aliased one stays valid. |
+  const bool from_own_buffer = |
+      ours_ && data_ != NULL && bytes >= data_ && bytes < data_ + size_; |
+  const int own_offset = |
+      from_own_buffer ? static_cast<int>(bytes - data_) : 0; |
+ |
+  reserve(size_ + byte_length); |
+ |
+  // Skip memcpy for empty input: data_ or bytes may be NULL then, and |
+  // memcpy must not be given a null pointer even for zero bytes. |
+  if (byte_length <= 0) return; |
+ |
+  // reserve() preserved the old contents, so the same offset is still valid. |
+  const char* source = from_own_buffer ? data_ + own_offset : bytes; |
+  memcpy(data_ + size_, source, byte_length); |
+  size_ += byte_length; |
+} |
+ |
+// Returns a one-line description of this Repr for debugging: its own |
+// address, the buffer address, size, capacity and whether the buffer is |
+// owned or aliased. |
+string UnicodeText::Repr::DebugString() const { |
+  stringstream ss; |
+ |
+  // data_ is a char*: streamed directly it would be read as a C string |
+  // (the buffer is not NUL-terminated and may be NULL), so print it as an |
+  // address instead. |
+  ss << "{Repr " << hex << this |
+     << " data=" << static_cast<const void*>(data_) |
+     << " size=" << dec << size_ |
+     << " capacity=" << capacity_ << " " |
+     << (ours_ ? "Owned" : "Alias") << "}"; |
+ |
+  // str() returns the whole text; extracting with >> would stop at the |
+  // first space and yield only "{Repr". |
+  return ss.str(); |
+} |
+ |
+ |
+ |
+// *************** UnicodeText ****************** |
+ |
+// ----- Constructors ----- |
+ |
+// Default constructor. repr_ is default-constructed; nothing else is set |
+// up here. |
+UnicodeText::UnicodeText() {} |
+ |
+// Copy constructor: takes a private copy of src's UTF-8 bytes. |
+UnicodeText::UnicodeText(const UnicodeText& src) { |
+  const Repr& from = src.repr_; |
+  repr_.Copy(from.data_, from.size_); |
+} |
+ |
+// Substring constructor: copies the bytes between two iterators, which |
+// must satisfy first <= last. |
+UnicodeText::UnicodeText(const UnicodeText::const_iterator& first, |
+                         const UnicodeText::const_iterator& last) { |
+  assert(first <= last && "Incompatible iterators"); |
+  const char* const from = first.it_; |
+  const int byte_count = last.it_ - from; |
+  repr_.append(from, byte_count); |
+} |
+ |
+// Returns the raw UTF-8 bytes between two iterators (first <= last) as a |
+// string. |
+string UnicodeText::UTF8Substring(const const_iterator& first, |
+                                  const const_iterator& last) { |
+  assert(first <= last && "Incompatible iterators"); |
+  const char* const from = first.it_; |
+  string bytes(from, last.it_ - from); |
+  return bytes; |
+} |
+ |
+ |
+// ----- Copy ----- |
+ |
+// Copy assignment: replaces the contents with a copy of src. Assigning an |
+// object to itself is a no-op. |
+UnicodeText& UnicodeText::operator=(const UnicodeText& src) { |
+  if (this == &src) { |
+    return *this; |
+  } |
+  Copy(src); |
+  return *this; |
+} |
+ |
+// Replaces the contents with a private copy of src's bytes. Returns *this. |
+UnicodeText& UnicodeText::Copy(const UnicodeText& src) { |
+  const Repr& from = src.repr_; |
+  repr_.Copy(from.data_, from.size_); |
+  return *this; |
+} |
+ |
+// Copies byte_length bytes of UTF-8 from buffer. If the input is not |
+// interchange-valid, a warning goes to cerr and the copy is sanitized in |
+// place by ConvertToInterchangeValid. Returns *this. |
+UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) { |
+  repr_.Copy(buffer, byte_length); |
+ |
+  const bool input_ok = UniLib::IsInterchangeValid(buffer, byte_length); |
+  if (input_ok) { |
+    return *this; |
+  } |
+ |
+  cerr << "UTF-8 buffer is not interchange-valid." << endl; |
+  // Sanitizing can only shrink the data, so record the new length. |
+  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); |
+  return *this; |
+} |
+ |
+// Copies byte_length bytes from buffer without any validity check. |
+// Returns *this. |
+UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer, |
+                                         int byte_length) { |
+  Repr& target = repr_; |
+  target.Copy(buffer, byte_length); |
+  return *this; |
+} |
+ |
+// ----- TakeOwnershipOf ----- |
+ |
+// Adopts buffer (byte_length bytes used, byte_capacity allocated) as our |
+// storage. If the bytes are not interchange-valid, a warning goes to cerr |
+// and the buffer is sanitized in place. Returns *this. |
+UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer, |
+                                              int byte_length, |
+                                              int byte_capacity) { |
+  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); |
+ |
+  const bool input_ok = UniLib::IsInterchangeValid(buffer, byte_length); |
+  if (input_ok) { |
+    return *this; |
+  } |
+ |
+  cerr << "UTF-8 buffer is not interchange-valid." << endl; |
+  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); |
+  return *this; |
+} |
+ |
+// Adopts buffer as our storage without any validity check. Returns *this. |
+UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer, |
+                                                    int byte_length, |
+                                                    int byte_capacity) { |
+  Repr& target = repr_; |
+  target.TakeOwnershipOf(buffer, byte_length, byte_capacity); |
+  return *this; |
+} |
+ |
+// ----- PointTo ----- |
+ |
+// Aliases the caller's buffer when it is interchange-valid. Otherwise a |
+// warning goes to cerr and a sanitized private copy is kept instead, since |
+// the caller's const bytes cannot be repaired in place. Returns *this. |
+UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) { |
+  if (!UniLib::IsInterchangeValid(buffer, byte_length)) { |
+    cerr << "UTF-8 buffer is not interchange-valid." << endl; |
+    repr_.Copy(buffer, byte_length); |
+    repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); |
+    return *this; |
+  } |
+ |
+  repr_.PointTo(buffer, byte_length); |
+  return *this; |
+} |
+ |
+// Aliases the caller's buffer without any validity check. Returns *this. |
+UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer, |
+                                            int byte_length) { |
+  Repr& target = repr_; |
+  target.PointTo(buffer, byte_length); |
+  return *this; |
+} |
+ |
+// Makes this text an alias of src's bytes; nothing is copied. Returns *this. |
+UnicodeText& UnicodeText::PointTo(const UnicodeText& src) { |
+  const Repr& from = src.repr_; |
+  repr_.PointTo(from.data_, from.size_); |
+  return *this; |
+} |
+ |
+// Makes this text an alias of the bytes between two iterators, which must |
+// satisfy first <= last. Nothing is copied. Returns *this. |
+UnicodeText& UnicodeText::PointTo(const const_iterator &first, |
+                                  const const_iterator &last) { |
+  assert(first <= last && " Incompatible iterators"); |
+  const char* const head = first.utf8_data(); |
+  const char* const tail = last.utf8_data(); |
+  repr_.PointTo(head, tail - head); |
+  return *this; |
+} |
+ |
+// ----- Append ----- |
+ |
+// Appends all of u's bytes to this text. Returns *this. |
+UnicodeText& UnicodeText::append(const UnicodeText& u) { |
+  const Repr& from = u.repr_; |
+  repr_.append(from.data_, from.size_); |
+  return *this; |
+} |
+ |
+// Appends the bytes between two iterators, which must satisfy |
+// first <= last. Returns *this. |
+UnicodeText& UnicodeText::append(const const_iterator& first, |
+                                 const const_iterator& last) { |
+  assert(first <= last && "Incompatible iterators"); |
+  const char* const from = first.it_; |
+  const int byte_count = last.it_ - from; |
+  repr_.append(from, byte_count); |
+  return *this; |
+} |
+ |
+// Appends len raw bytes from utf8 without any validity check. |
+// Returns *this. |
+UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) { |
+  Repr& target = repr_; |
+  target.append(utf8, len); |
+  return *this; |
+} |
+ |
+// ----- substring searching ----- |
+ |
+// Searches for look starting at start_pos, which is asserted to lie within |
+// this text's bytes. Delegates the search itself to UnsafeFind. |
+UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look, |
+                                              const_iterator start_pos) const { |
+  assert(start_pos.utf8_data() >= utf8_data()); |
+  assert(start_pos.utf8_data() <= utf8_data() + utf8_length()); |
+  const const_iterator hit = UnsafeFind(look, start_pos); |
+  return hit; |
+} |
+ |
+// Searches for look from the beginning of this text. |
+UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const { |
+  const const_iterator from_start = begin(); |
+  return UnsafeFind(look, from_start); |
+} |
+ |
+// Finds look at or after start_pos without checking that start_pos belongs |
+// to this text. Returns an iterator at the match, or end() if there is none. |
+UnicodeText::const_iterator UnicodeText::UnsafeFind( |
+    const UnicodeText& look, const_iterator start_pos) const { |
+  // Due to the magic of the UTF8 encoding, searching for a sequence of |
+  // letters is equivalent to a plain byte-substring search. |
+  StringPiece haystack(utf8_data(), utf8_length()); |
+  StringPiece needle(look.utf8_data(), look.utf8_length()); |
+ |
+  // The search starts at start_pos's byte offset from the beginning. |
+  StringPiece::size_type hit = |
+      haystack.find(needle, start_pos.utf8_data() - utf8_data()); |
+ |
+  if (hit != StringPiece::npos) { |
+    return const_iterator(utf8_data() + hit); |
+  } |
+  return end(); |
+} |
+ |
+// Reports whether the text contains U+FFFD REPLACEMENT CHARACTER. |
+bool UnicodeText::HasReplacementChar() const { |
+  // Same result as building a one-character UnicodeText holding 0xFFFD and |
+  // calling find() on it, but done directly on the bytes: the UTF-8 |
+  // encoding of U+FFFD is the three bytes EF BF BD. |
+  StringPiece haystack(utf8_data(), utf8_length()); |
+  StringPiece needle("\xEF\xBF\xBD", 3); |
+  const bool absent = (haystack.find(needle) == StringPiece::npos); |
+  return !absent; |
+} |
+ |
+// ----- other methods ----- |
+ |
+// Empties the text by clearing the underlying representation. |
+void UnicodeText::clear() { |
+  Repr& target = repr_; |
+  target.clear(); |
+} |
+ |
+// Destructor. There is nothing to release explicitly here; repr_ is |
+// destroyed as a member. |
+UnicodeText::~UnicodeText() { |
+} |
+ |
+ |
+// Appends the code point c, encoded as UTF-8. A value that is not a valid |
+// code point, or whose encoding is not interchange-valid, is reported on |
+// cerr and a single space is appended in its place. |
+void UnicodeText::push_back(char32 c) { |
+  // The hex manipulator is sticky: without restoring the flags, every |
+  // later integer written to cerr would also come out in hexadecimal. |
+  const std::ios_base::fmtflags saved_flags = cerr.flags(); |
+ |
+  if (UniLib::IsValidCodepoint(c)) { |
+    char buf[UTFmax]; |
+    int len = runetochar(buf, &c); |
+    if (UniLib::IsInterchangeValid(buf, len)) { |
+      repr_.append(buf, len); |
+      return; |
+    } |
+    cerr << "Unicode value 0x" << hex << c |
+         << " is not valid for interchange" << endl; |
+  } else { |
+    cerr << "Illegal Unicode value: 0x" << hex << c << endl; |
+  } |
+ |
+  cerr.flags(saved_flags); |
+  // Both failure cases substitute one space for the rejected value. |
+  repr_.append(" ", 1); |
+} |
+ |
+// Returns the length in code points (not bytes). The count is computed by |
+// scanning all the bytes on every call. |
+int UnicodeText::size() const { |
+  const Repr& r = repr_; |
+  return CodepointCount(r.data_, r.size_); |
+} |
+ |
+// Two texts are equal when their UTF-8 bytes are identical. |
+bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) { |
+  if (&lhs == &rhs) return true; |
+ |
+  const int byte_count = lhs.repr_.size_; |
+  if (byte_count != rhs.repr_.size_) return false; |
+ |
+  // Two empty texts are equal. Their data_ pointers may be NULL, and memcmp |
+  // must not be given a null pointer even for zero bytes. |
+  if (byte_count == 0) return true; |
+ |
+  return memcmp(lhs.repr_.data_, rhs.repr_.data_, byte_count) == 0; |
+} |
+ |
+// Returns a one-line description for debugging: this object's address, |
+// its length in code points, and the representation's own DebugString(). |
+string UnicodeText::DebugString() const { |
+  stringstream ss; |
+ |
+  ss << "{UnicodeText " << hex << this << dec << " chars=" |
+     << size() << " repr=" << repr_.DebugString() << "}"; |
+ |
+  // str() returns the whole text; extracting with >> would stop at the |
+  // first space and yield only "{UnicodeText". |
+  return ss.str(); |
+} |
+ |
+ |
+// ******************* UnicodeText::const_iterator ********************* |
+ |
+// The implementation of const_iterator would be nicer if it |
+// inherited from boost::iterator_facade |
+// (http://boost.org/libs/iterator/doc/iterator_facade.html). |
+ |
+// Default constructor: the iterator holds a null position. |
+UnicodeText::const_iterator::const_iterator() |
+    : it_(NULL) { |
+} |
+ |
+// Copy constructor: the new iterator refers to the same byte position. |
+UnicodeText::const_iterator::const_iterator(const const_iterator& other) |
+    : it_(other.it_) {} |
+ |
+// Copy assignment: adopts other's byte position. Self-assignment is a no-op. |
+UnicodeText::const_iterator& |
+UnicodeText::const_iterator::operator=(const const_iterator& other) { |
+  if (this == &other) { |
+    return *this; |
+  } |
+  it_ = other.it_; |
+  return *this; |
+} |
+ |
+// Iterator positioned at the first byte of the text. |
+UnicodeText::const_iterator UnicodeText::begin() const { |
+  const char* const first_byte = repr_.data_; |
+  return const_iterator(first_byte); |
+} |
+ |
+// Iterator positioned one past the last byte of the text. |
+UnicodeText::const_iterator UnicodeText::end() const { |
+  const char* const past_last = repr_.data_ + repr_.size_; |
+  return const_iterator(past_last); |
+} |
+ |
+// Orders iterators by the address of the byte they refer to. |
+bool operator<(const UnicodeText::const_iterator& lhs, |
+               const UnicodeText::const_iterator& rhs) { |
+  const char* const left = lhs.it_; |
+  const char* const right = rhs.it_; |
+  return left < right; |
+} |
+ |
+// Decodes and returns the code point at the current position. |
+char32 UnicodeText::const_iterator::operator*() const { |
+  // (We could call chartorune here, but that does some |
+  // error-checking, and we're guaranteed that our data is valid |
+  // UTF-8. Also, we expect this routine to be called very often. So |
+  // for speed, we do the calculation ourselves.) |
+ |
+  // The bytes must be read as unsigned. Where plain char is signed, a lead |
+  // byte >= 0x80 would otherwise become negative, pass the "< 0x80" test |
+  // below and be returned as if it were a one-byte character. |
+  const unsigned char byte1 = static_cast<unsigned char>(it_[0]); |
+  if (byte1 < 0x80) |
+    return byte1;  // 0xxxxxxx: one-byte (ASCII) character. |
+ |
+  const unsigned char byte2 = static_cast<unsigned char>(it_[1]); |
+  if (byte1 < 0xE0)  // 110xxxxx 10xxxxxx |
+    return ((byte1 & 0x1F) << 6) |
+          | (byte2 & 0x3F); |
+ |
+  const unsigned char byte3 = static_cast<unsigned char>(it_[2]); |
+  if (byte1 < 0xF0)  // 1110xxxx 10xxxxxx 10xxxxxx |
+    return ((byte1 & 0x0F) << 12) |
+          | ((byte2 & 0x3F) << 6) |
+          | (byte3 & 0x3F); |
+ |
+  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
+  const unsigned char byte4 = static_cast<unsigned char>(it_[3]); |
+  return ((byte1 & 0x07) << 18) |
+        | ((byte2 & 0x3F) << 12) |
+        | ((byte3 & 0x3F) << 6) |
+        | (byte4 & 0x3F); |
+} |
+ |
+// Pre-increment: advances by the byte length UniLib::OneCharLen reports |
+// for the current position. |
+UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() { |
+  const int char_bytes = UniLib::OneCharLen(it_); |
+  it_ += char_bytes; |
+  return *this; |
+} |
+ |
+// Pre-decrement: steps back at least one byte, and keeps stepping back |
+// while the byte reached is a trail byte. |
+UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() { |
+  do { |
+    --it_; |
+  } while (UniLib::IsTrailByte(*it_)); |
+  return *this; |
+} |
+ |
+// Copies the UTF-8 bytes of the current character into utf8_output and |
+// returns how many bytes (1 to 4) were written. No terminator is added. |
+int UnicodeText::const_iterator::get_utf8(char* utf8_output) const { |
+  // The lead byte alone determines the length of the sequence. |
+  const unsigned char lead = static_cast<unsigned char>(it_[0]); |
+  int byte_count = 4; |
+  if (lead < 0x80) { |
+    byte_count = 1; |
+  } else if (lead < 0xE0) { |
+    byte_count = 2; |
+  } else if (lead < 0xF0) { |
+    byte_count = 3; |
+  } |
+ |
+  for (int i = 0; i < byte_count; ++i) { |
+    utf8_output[i] = it_[i]; |
+  } |
+  return byte_count; |
+} |
+ |
+ |
+// Wraps a raw pointer into this text's bytes as an iterator. The asserts |
+// require p to be non-null, to lie within [begin, end] of the bytes, and |
+// (unless it is the end) not to point at a trail byte. |
+UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const { |
+  assert(p != NULL); |
+  const char* const first_byte = utf8_data(); |
+  const char* const past_last = first_byte + utf8_length(); |
+  assert(p >= first_byte); |
+  assert(p <= past_last); |
+  assert(p == past_last || !UniLib::IsTrailByte(*p)); |
+  return const_iterator(p); |
+} |
+ |
+// Returns a short description of the iterator for debugging, showing the |
+// address it refers to. |
+string UnicodeText::const_iterator::DebugString() const { |
+  stringstream ss; |
+ |
+  // it_ is a character pointer: streamed directly it would be read as a C |
+  // string (possibly unterminated or NULL), so print it as an address. |
+  ss << "{iter " << hex << static_cast<const void*>(it_) << "}"; |
+ |
+  // str() returns the whole text; extracting with >> would stop at the |
+  // first space and yield only "{iter". |
+  return ss.str(); |
+} |
+ |
Property changes on: third_party\libphonenumber\cpp\src\utf\unicodetext.cc |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |