third_party/libphonenumber/cpp/src/utf/unicodetext.cc - Issue 6803005: Autofill phone number enhancements and integration of Phone Number Util Library: part 1

Unified Diff: third_party/libphonenumber/cpp/src/utf/unicodetext.cc

Issue 6803005: Autofill phone number enhancements and integration of Phone Number Util Library: part 1 (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 9 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« third_party/libphonenumber/chrome/regexp_adapter_icuregexp.cc ('K') | « third_party/libphonenumber/cpp/src/utf/unicodetext.h ('k') | third_party/libphonenumber/cpp/src/utf/unilib.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: third_party/libphonenumber/cpp/src/utf/unicodetext.cc

===================================================================

--- third_party/libphonenumber/cpp/src/utf/unicodetext.cc (revision 0)

+++ third_party/libphonenumber/cpp/src/utf/unicodetext.cc (revision 0)

@@ -0,0 +1,515 @@

+//

+// Licensed under the Apache License, Version 2.0 (the "License");

+// you may not use this file except in compliance with the License.

+// You may obtain a copy of the License at

+//

+// http://www.apache.org/licenses/LICENSE-2.0

+//

+// Unless required by applicable law or agreed to in writing, software

+// distributed under the License is distributed on an "AS IS" BASIS,

+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+// See the License for the specific language governing permissions and

+// limitations under the License.

+// Author: Jim Meehan

+#include <iostream>

+#include <sstream>

+#include <cassert>

+#include "utf/unicodetext.h"

+//#include "base/logging.h"

+#include "utf/stringpiece.h"

+//#include "utf/stringprintf.h"

+#include "utf/utf.h"

+#include "utf/unilib.h"

+using std::stringstream;

+using std::max;

+using std::hex;

+using std::dec;

+using std::cerr;

+using std::endl;

+static int CodepointDistance(const char* start, const char* end) {

+ int n = 0;

+ // Increment n on every non-trail-byte.

+ for (const char* p = start; p < end; ++p) {

+ n += (*reinterpret_cast<const signed char*>(p) >= -0x40);

+ }

+ return n;

+static int CodepointCount(const char* utf8, int len) {

+ return CodepointDistance(utf8, utf8 + len);

+UnicodeText::const_iterator::difference_type

+distance(const UnicodeText::const_iterator& first,

+ const UnicodeText::const_iterator& last) {

+ return CodepointDistance(first.it_, last.it_);

+// ---------- Utility ----------

+static int ConvertToInterchangeValid(char* start, int len) {

+ // This routine is called only when we've discovered that a UTF-8 buffer

+ // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8

+ // was not interchange valid. This indicates a bug in the caller, and

+ // a LOG(WARNING) is done in that case.

+ // This is similar to CoerceToInterchangeValid, but it replaces each

+ // structurally valid byte with a space, and each non-interchange

+ // character with a space, even when that character requires more

+ // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is

+ // structurally valid UTF8, but U+FDD0 is not an interchange-valid

+ // code point. The result should contain one space, not three.

+ //

+ // Since the conversion never needs to write more data than it

+ // reads, it is safe to change the buffer in place. It returns the

+ // number of bytes written.

+ char* const in = start;

+ char* out = start;

+ char* const end = start + len;

+ while (start < end) {

+ int good = UniLib::SpanInterchangeValid(start, end - start);

+ if (good > 0) {

+ if (out != start) {

+ memmove(out, start, good);

+ }

+ out += good;

+ start += good;

+ if (start == end) {

+ break;

+ }

+ // Is the current string invalid UTF8 or just non-interchange UTF8?

+ char32 rune;

+ int n;

+ if (isvalidcharntorune(start, end - start, &rune, &n)) {

+ // structurally valid UTF8, but not interchange valid

+ start += n; // Skip over the whole character.

+ } else { // bad UTF8

+ start += 1; // Skip over just one byte

+ }

+ *out++ = ' ';

+ }

+ return out - in;

+// *************** Data representation **********

+// Note: the copy constructor is undefined.

+// After reserve(), resize(), or clear(), we're an owner, not an alias.

+void UnicodeText::Repr::reserve(int new_capacity) {

+ // If there's already enough capacity, and we're an owner, do nothing.

+ if (capacity_ >= new_capacity && ours_) return;

+ // Otherwise, allocate a new buffer.

+ capacity_ = max(new_capacity, (3 * capacity_) / 2 + 20);

+ char* new_data = new char[capacity_];

+ // If there is an old buffer, copy it into the new buffer.

+ if (data_) {

+ memcpy(new_data, data_, size_);

+ if (ours_) delete[] data_; // If we owned the old buffer, free it.

+ }

+ data_ = new_data;

+ ours_ = true; // We own the new buffer.

+ // size_ is unchanged.

+void UnicodeText::Repr::resize(int new_size) {

+ if (new_size == 0) {

+ clear();

+ } else {

+ if (!ours_ || new_size > capacity_) reserve(new_size);

+ // Clear the memory in the expanded part.

+ if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);

+ size_ = new_size;

+ ours_ = true;

+ }

+// This implementation of clear() deallocates the buffer if we're an owner.

+// That's not strictly necessary; we could just set size_ to 0.

+void UnicodeText::Repr::clear() {

+ if (ours_) delete[] data_;

+ data_ = NULL;

+ size_ = capacity_ = 0;

+ ours_ = true;

+void UnicodeText::Repr::Copy(const char* data, int size) {

+ resize(size);

+ memcpy(data_, data, size);

+void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {

+ if (data == data_) return; // We already own this memory. (Weird case.)

+ if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.

+ data_ = data;

+ size_ = size;

+ capacity_ = capacity;

+ ours_ = true;

+void UnicodeText::Repr::PointTo(const char* data, int size) {

+ if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.

+ data_ = const_cast<char*>(data);

+ size_ = size;

+ capacity_ = size;

+ ours_ = false;

+void UnicodeText::Repr::append(const char* bytes, int byte_length) {

+ reserve(size_ + byte_length);

+ memcpy(data_ + size_, bytes, byte_length);

+ size_ += byte_length;

+string UnicodeText::Repr::DebugString() const {

+ stringstream ss;

+ ss << "{Repr " << hex << this << " data=" << data_ << " size=" << dec

+ << size_ << " capacity=" << capacity_ << " "

+ << (ours_ ? "Owned" : "Alias") << "}";

+ string result;

+ ss >> result;

+ return result;

+// *************** UnicodeText ******************

+// ----- Constructors -----

+// Default constructor

+UnicodeText::UnicodeText() {

+// Copy constructor

+UnicodeText::UnicodeText(const UnicodeText& src) {

+ Copy(src);

+// Substring constructor

+UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,

+ const UnicodeText::const_iterator& last) {

+ assert(first <= last && "Incompatible iterators");

+ repr_.append(first.it_, last.it_ - first.it_);

+string UnicodeText::UTF8Substring(const const_iterator& first,

+ const const_iterator& last) {

+ assert(first <= last && "Incompatible iterators");

+ return string(first.it_, last.it_ - first.it_);

+// ----- Copy -----

+UnicodeText& UnicodeText::operator=(const UnicodeText& src) {

+ if (this != &src) {

+ Copy(src);

+ }

+ return *this;

+UnicodeText& UnicodeText::Copy(const UnicodeText& src) {

+ repr_.Copy(src.repr_.data_, src.repr_.size_);

+ return *this;

+UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {

+ repr_.Copy(buffer, byte_length);

+ if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {

+ cerr << "UTF-8 buffer is not interchange-valid." << endl;

+ repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);

+ }

+ return *this;

+UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,

+ int byte_length) {

+ repr_.Copy(buffer, byte_length);

+ return *this;

+// ----- TakeOwnershipOf -----

+UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,

+ int byte_length,

+ int byte_capacity) {

+ repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);

+ if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {

+ cerr << "UTF-8 buffer is not interchange-valid." << endl;

+ repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);

+ }

+ return *this;

+UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,

+ int byte_length,

+ int byte_capacity) {

+ repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);

+ return *this;

+// ----- PointTo -----

+UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {

+ if (UniLib:: IsInterchangeValid(buffer, byte_length)) {

+ repr_.PointTo(buffer, byte_length);

+ } else {

+ cerr << "UTF-8 buffer is not interchange-valid." << endl;

+ repr_.Copy(buffer, byte_length);

+ repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);

+ }

+ return *this;

+UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,

+ int byte_length) {

+ repr_.PointTo(buffer, byte_length);

+ return *this;

+UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {

+ repr_.PointTo(src.repr_.data_, src.repr_.size_);

+ return *this;

+UnicodeText& UnicodeText::PointTo(const const_iterator &first,

+ const const_iterator &last) {

+ assert(first <= last && " Incompatible iterators");

+ repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());

+ return *this;

+// ----- Append -----

+UnicodeText& UnicodeText::append(const UnicodeText& u) {

+ repr_.append(u.repr_.data_, u.repr_.size_);

+ return *this;

+UnicodeText& UnicodeText::append(const const_iterator& first,

+ const const_iterator& last) {

+ assert(first <= last && "Incompatible iterators");

+ repr_.append(first.it_, last.it_ - first.it_);

+ return *this;

+UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {

+ repr_.append(utf8, len);

+ return *this;

+// ----- substring searching -----

+UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,

+ const_iterator start_pos) const {

+ assert(start_pos.utf8_data() >= utf8_data());

+ assert(start_pos.utf8_data() <= utf8_data() + utf8_length());

+ return UnsafeFind(look, start_pos);

+UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {

+ return UnsafeFind(look, begin());

+UnicodeText::const_iterator UnicodeText::UnsafeFind(

+ const UnicodeText& look, const_iterator start_pos) const {

+ // Due to the magic of the UTF8 encoding, searching for a sequence of

+ // letters is equivalent to substring search.

+ StringPiece searching(utf8_data(), utf8_length());

+ StringPiece look_piece(look.utf8_data(), look.utf8_length());

+ StringPiece::size_type found =

+ searching.find(look_piece, start_pos.utf8_data() - utf8_data());

+ if (found == StringPiece::npos) return end();

+ return const_iterator(utf8_data() + found);

+bool UnicodeText::HasReplacementChar() const {

+ // Equivalent to:

+ // UnicodeText replacement_char;

+ // replacement_char.push_back(0xFFFD);

+ // return find(replacement_char) != end();

+ StringPiece searching(utf8_data(), utf8_length());

+ StringPiece looking_for("\xEF\xBF\xBD", 3);

+ return searching.find(looking_for) != StringPiece::npos;

+// ----- other methods -----

+// Clear operator

+void UnicodeText::clear() {

+ repr_.clear();

+// Destructor

+UnicodeText::~UnicodeText() {}

+void UnicodeText::push_back(char32 c) {

+ if (UniLib::IsValidCodepoint(c)) {

+ char buf[UTFmax];

+ int len = runetochar(buf, &c);

+ if (UniLib::IsInterchangeValid(buf, len)) {

+ repr_.append(buf, len);

+ } else {

+ cerr << "Unicode value 0x" << hex << c

+ << " is not valid for interchange" << endl;

+ repr_.append(" ", 1);

+ }

+ } else {

+ cerr << "Illegal Unicode value: 0x" << hex << c << endl;

+ repr_.append(" ", 1);

+ }

+int UnicodeText::size() const {

+ return CodepointCount(repr_.data_, repr_.size_);

+bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {

+ if (&lhs == &rhs) return true;

+ if (lhs.repr_.size_ != rhs.repr_.size_) return false;

+ return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;

+string UnicodeText::DebugString() const {

+ stringstream ss;

+ ss << "{UnicodeText " << hex << this << dec << " chars="

+ << size() << " repr=" << repr_.DebugString() << "}";

+#if 0

+ return StringPrintf("{UnicodeText %p chars=%d repr=%s}",

+ this,

+ size(),

+ repr_.DebugString().c_str());

+#endif

+ string result;

+ ss >> result;

+ return result;

+// ******************* UnicodeText::const_iterator *********************

+// The implementation of const_iterator would be nicer if it

+// inherited from boost::iterator_facade

+// (http://boost.org/libs/iterator/doc/iterator_facade.html).

+UnicodeText::const_iterator::const_iterator() : it_(0) {}

+UnicodeText::const_iterator::const_iterator(const const_iterator& other)

+ : it_(other.it_) {

+UnicodeText::const_iterator&

+UnicodeText::const_iterator::operator=(const const_iterator& other) {

+ if (&other != this)

+ it_ = other.it_;

+ return *this;

+UnicodeText::const_iterator UnicodeText::begin() const {

+ return const_iterator(repr_.data_);

+UnicodeText::const_iterator UnicodeText::end() const {

+ return const_iterator(repr_.data_ + repr_.size_);

+bool operator<(const UnicodeText::const_iterator& lhs,

+ const UnicodeText::const_iterator& rhs) {

+ return lhs.it_ < rhs.it_;

+char32 UnicodeText::const_iterator::operator*() const {

+ // (We could call chartorune here, but that does some

+ // error-checking, and we're guaranteed that our data is valid

+ // UTF-8. Also, we expect this routine to be called very often. So

+ // for speed, we do the calculation ourselves.)

+ // Convert from UTF-8

+ int byte1 = it_[0];

+ if (byte1 < 0x80)

+ return byte1;

+ int byte2 = it_[1];

+ if (byte1 < 0xE0)

+ return ((byte1 & 0x1F) << 6)

+ | (byte2 & 0x3F);

+ int byte3 = it_[2];

+ if (byte1 < 0xF0)

+ return ((byte1 & 0x0F) << 12)

+ | ((byte2 & 0x3F) << 6)

+ | (byte3 & 0x3F);

+ int byte4 = it_[3];

+ return ((byte1 & 0x07) << 18)

+ | ((byte2 & 0x3F) << 12)

+ | ((byte3 & 0x3F) << 6)

+ | (byte4 & 0x3F);

+UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {

+ it_ += UniLib::OneCharLen(it_);

+ return *this;

+UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {

+ while (UniLib::IsTrailByte(*--it_));

+ return *this;

+int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {

+ utf8_output[0] = it_[0];

+ if (static_cast<unsigned char>(it_[0]) < 0x80)

+ return 1;

+ utf8_output[1] = it_[1];

+ if (static_cast<unsigned char>(it_[0]) < 0xE0)

+ return 2;

+ utf8_output[2] = it_[2];

+ if (static_cast<unsigned char>(it_[0]) < 0xF0)

+ return 3;

+ utf8_output[3] = it_[3];

+ return 4;

+UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {

+ assert(p != NULL);

+ const char* start = utf8_data();

+ int len = utf8_length();

+ const char* end = start + len;

+ assert(p >= start);

+ assert(p <= end);

+ assert(p == end || !UniLib::IsTrailByte(*p));

+ return const_iterator(p);

+string UnicodeText::const_iterator::DebugString() const {

+ stringstream ss;

+ ss << "{iter " << hex << it_ << "}";

+ string result;

+ ss >> result;

+ return result;

Property changes on: third_party\libphonenumber\cpp\src\utf\unicodetext.cc

___________________________________________________________________

Added: svn:eol-style

+ LF