third_party/libphonenumber/cpp/src/utf/unicodetext.cc - Issue 6803005: Autofill phone number enhancements and integration of Phone Number Util Library: part 1

Side by Side Diff: third_party/libphonenumber/cpp/src/utf/unicodetext.cc

Issue 6803005: Autofill phone number enhancements and integration of Phone Number Util Library: part 1 (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 9 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« third_party/libphonenumber/chrome/regexp_adapter_icuregexp.cc ('K') | « third_party/libphonenumber/cpp/src/utf/unicodetext.h ('k') | third_party/libphonenumber/cpp/src/utf/unilib.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 // Copyright (C) 2006 Google Inc.

	2 //

	3 // Licensed under the Apache License, Version 2.0 (the "License");

	4 // you may not use this file except in compliance with the License.

	5 // You may obtain a copy of the License at

	6 //

	7 // http://www.apache.org/licenses/LICENSE-2.0

	8 //

	9 // Unless required by applicable law or agreed to in writing, software

	10 // distributed under the License is distributed on an "AS IS" BASIS,

	11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

	12 // See the License for the specific language governing permissions and

	13 // limitations under the License.

	14

	15 // Author: Jim Meehan

	16

	17 #include <iostream>

	18 #include <sstream>

	19 #include <cassert>

	20

	21 #include "utf/unicodetext.h"

	22 //#include "base/logging.h"

	23 #include "utf/stringpiece.h"

	24 //#include "utf/stringprintf.h"

	25 #include "utf/utf.h"

	26 #include "utf/unilib.h"

	27

	28 using std::stringstream;

	29 using std::max;

	30 using std::hex;

	31 using std::dec;

	32 using std::cerr;

	33 using std::endl;

	34

	// Counts the code points encoded in the UTF-8 byte range [start, end).
	//
	// Every byte that is not a UTF-8 trail byte begins a code point. Trail
	// bytes are 0x80..0xBF, which read as a signed char are -128..-65, so a
	// byte starts a code point exactly when its signed value is >= -0x40.
	// Returns 0 when start >= end.
	static int CodepointDistance(const char* start, const char* end) {
	  int n = 0;
	  for (const char* p = start; p < end; ++p) {
	    // The pointer is reinterpreted and then dereferenced; casting the
	    // pointer value itself to a char type would neither compile nor
	    // inspect the byte.
	    n += (*reinterpret_cast<const signed char*>(p) >= -0x40);
	  }
	  return n;
	}

	43

	44 static int CodepointCount(const char* utf8, int len) {

	45 return CodepointDistance(utf8, utf8 + len);

	46 }

	47

	48 UnicodeText::const_iterator::difference_type

	49 distance(const UnicodeText::const_iterator& first,

	50 const UnicodeText::const_iterator& last) {

	51 return CodepointDistance(first.it_, last.it_);

	52 }

	53

	54 // ---------- Utility ----------

	55

	56 static int ConvertToInterchangeValid(char* start, int len) {

	57 // This routine is called only when we've discovered that a UTF-8 buffer

	58 // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8

	59 // was not interchange valid. This indicates a bug in the caller, and

	60 // a LOG(WARNING) is done in that case.

	61 // This is similar to CoerceToInterchangeValid, but it replaces each

	62 // structurally valid byte with a space, and each non-interchange

	63 // character with a space, even when that character requires more

	64 // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is

	65 // structurally valid UTF8, but U+FDD0 is not an interchange-valid

	66 // code point. The result should contain one space, not three.

	67 //

	68 // Since the conversion never needs to write more data than it

	69 // reads, it is safe to change the buffer in place. It returns the

	70 // number of bytes written.

	71 char* const in = start;

	72 char* out = start;

	73 char* const end = start + len;

	74 while (start < end) {

	75 int good = UniLib::SpanInterchangeValid(start, end - start);

	76 if (good > 0) {

	77 if (out != start) {

	78 memmove(out, start, good);

	79 }

	80 out += good;

	81 start += good;

	82 if (start == end) {

	83 break;

	84 }

	85 }

	86 // Is the current string invalid UTF8 or just non-interchange UTF8?

	87 char32 rune;

	88 int n;

	89 if (isvalidcharntorune(start, end - start, &rune, &n)) {

	90 // structurally valid UTF8, but not interchange valid

	91 start += n; // Skip over the whole character.

	92 } else { // bad UTF8

	93 start += 1; // Skip over just one byte

	94 }

	95 *out++ = ' ';

	96 }

	97 return out - in;

	98 }

	99

	100

	101 // ************* Data representation ********

	102

	103 // Note: the copy constructor is undefined.

	104

	105 // After reserve(), resize(), or clear(), we're an owner, not an alias.

	106

	107 void UnicodeText::Repr::reserve(int new_capacity) {

	108 // If there's already enough capacity, and we're an owner, do nothing.

	109 if (capacity_ >= new_capacity && ours_) return;

	110

	111 // Otherwise, allocate a new buffer.

	112 capacity_ = max(new_capacity, (3 * capacity_) / 2 + 20);

	113 char* new_data = new char[capacity_];

	114

	115 // If there is an old buffer, copy it into the new buffer.

	116 if (data_) {

	117 memcpy(new_data, data_, size_);

	118 if (ours_) delete[] data_; // If we owned the old buffer, free it.

	119 }

	120 data_ = new_data;

	121 ours_ = true; // We own the new buffer.

	122 // size_ is unchanged.

	123 }

	124

	125 void UnicodeText::Repr::resize(int new_size) {

	126 if (new_size == 0) {

	127 clear();

	128 } else {

	129 if (!ours_ \|\| new_size > capacity_) reserve(new_size);

	130 // Clear the memory in the expanded part.

	131 if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);

	132 size_ = new_size;

	133 ours_ = true;

	134 }

	135 }

	136

	137 // This implementation of clear() deallocates the buffer if we're an owner.

	138 // That's not strictly necessary; we could just set size_ to 0.

	139 void UnicodeText::Repr::clear() {

	140 if (ours_) delete[] data_;

	141 data_ = NULL;

	142 size_ = capacity_ = 0;

	143 ours_ = true;

	144 }

	145

	146 void UnicodeText::Repr::Copy(const char* data, int size) {

	147 resize(size);

	148 memcpy(data_, data, size);

	149 }

	150

	151 void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {

	152 if (data == data_) return; // We already own this memory. (Weird case.)

	153 if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.

	154 data_ = data;

	155 size_ = size;

	156 capacity_ = capacity;

	157 ours_ = true;

	158 }

	159

	160 void UnicodeText::Repr::PointTo(const char* data, int size) {

	161 if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.

	162 data_ = const_cast<char*>(data);

	163 size_ = size;

	164 capacity_ = size;

	165 ours_ = false;

	166 }

	167

	168 void UnicodeText::Repr::append(const char* bytes, int byte_length) {

	169 reserve(size_ + byte_length);

	170 memcpy(data_ + size_, bytes, byte_length);

	171 size_ += byte_length;

	172 }

	173

	174 string UnicodeText::Repr::DebugString() const {

	175 stringstream ss;

	176

	177 ss << "{Repr " << hex << this << " data=" << data_ << " size=" << dec

	178 << size_ << " capacity=" << capacity_ << " "

	179 << (ours_ ? "Owned" : "Alias") << "}";

	180

	181 string result;

	182 ss >> result;

	183

	184 return result;

	185 }

	186

	187

	188

	189 // ************* UnicodeText ****************

	190

	191 // ----- Constructors -----

	192

	193 // Default constructor

	194 UnicodeText::UnicodeText() {

	195 }

	196

	197 // Copy constructor

	198 UnicodeText::UnicodeText(const UnicodeText& src) {

	199 Copy(src);

	200 }

	201

	202 // Substring constructor

	203 UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,

	204 const UnicodeText::const_iterator& last) {

	205 assert(first <= last && "Incompatible iterators");

	206 repr_.append(first.it_, last.it_ - first.it_);

	207 }

	208

	209 string UnicodeText::UTF8Substring(const const_iterator& first,

	210 const const_iterator& last) {

	211 assert(first <= last && "Incompatible iterators");

	212 return string(first.it_, last.it_ - first.it_);

	213 }

	214

	215

	216 // ----- Copy -----

	217

	218 UnicodeText& UnicodeText::operator=(const UnicodeText& src) {

	219 if (this != &src) {

	220 Copy(src);

	221 }

	222 return *this;

	223 }

	224

	225 UnicodeText& UnicodeText::Copy(const UnicodeText& src) {

	226 repr_.Copy(src.repr_.data_, src.repr_.size_);

	227 return *this;

	228 }

	229

	230 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {

	231 repr_.Copy(buffer, byte_length);

	232 if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {

	233 cerr << "UTF-8 buffer is not interchange-valid." << endl;

	234 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);

	235 }

	236 return *this;

	237 }

	238

	239 UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,

	240 int byte_length) {

	241 repr_.Copy(buffer, byte_length);

	242 return *this;

	243 }

	244

	245 // ----- TakeOwnershipOf -----

	246

	247 UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,

	248 int byte_length,

	249 int byte_capacity) {

	250 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);

	251 if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {

	252 cerr << "UTF-8 buffer is not interchange-valid." << endl;

	253 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);

	254 }

	255 return *this;

	256 }

	257

	258 UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,

	259 int byte_length,

	260 int byte_capacity) {

	261 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);

	262 return *this;

	263 }

	264

	265 // ----- PointTo -----

	266

	267 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {

	268 if (UniLib:: IsInterchangeValid(buffer, byte_length)) {

	269 repr_.PointTo(buffer, byte_length);

	270 } else {

	271 cerr << "UTF-8 buffer is not interchange-valid." << endl;

	272 repr_.Copy(buffer, byte_length);

	273 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);

	274 }

	275 return *this;

	276 }

	277

	278 UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,

	279 int byte_length) {

	280 repr_.PointTo(buffer, byte_length);

	281 return *this;

	282 }

	283

	284 UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {

	285 repr_.PointTo(src.repr_.data_, src.repr_.size_);

	286 return *this;

	287 }

	288

	289 UnicodeText& UnicodeText::PointTo(const const_iterator &first,

	290 const const_iterator &last) {

	291 assert(first <= last && " Incompatible iterators");

	292 repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());

	293 return *this;

	294 }

	295

	296 // ----- Append -----

	297

	298 UnicodeText& UnicodeText::append(const UnicodeText& u) {

	299 repr_.append(u.repr_.data_, u.repr_.size_);

	300 return *this;

	301 }

	302

	303 UnicodeText& UnicodeText::append(const const_iterator& first,

	304 const const_iterator& last) {

	305 assert(first <= last && "Incompatible iterators");

	306 repr_.append(first.it_, last.it_ - first.it_);

	307 return *this;

	308 }

	309

	310 UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {

	311 repr_.append(utf8, len);

	312 return *this;

	313 }

	314

	315 // ----- substring searching -----

	316

	317 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,

	318 const_iterator start_pos) const {

	319 assert(start_pos.utf8_data() >= utf8_data());

	320 assert(start_pos.utf8_data() <= utf8_data() + utf8_length());

	321 return UnsafeFind(look, start_pos);

	322 }

	323

	324 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {

	325 return UnsafeFind(look, begin());

	326 }

	327

	328 UnicodeText::const_iterator UnicodeText::UnsafeFind(

	329 const UnicodeText& look, const_iterator start_pos) const {

	330 // Due to the magic of the UTF8 encoding, searching for a sequence of

	331 // letters is equivalent to substring search.

	332 StringPiece searching(utf8_data(), utf8_length());

	333 StringPiece look_piece(look.utf8_data(), look.utf8_length());

	334 StringPiece::size_type found =

	335 searching.find(look_piece, start_pos.utf8_data() - utf8_data());

	336 if (found == StringPiece::npos) return end();

	337 return const_iterator(utf8_data() + found);

	338 }

	339

	340 bool UnicodeText::HasReplacementChar() const {

	341 // Equivalent to:

	342 // UnicodeText replacement_char;

	343 // replacement_char.push_back(0xFFFD);

	344 // return find(replacement_char) != end();

	345 StringPiece searching(utf8_data(), utf8_length());

	346 StringPiece looking_for("\xEF\xBF\xBD", 3);

	347 return searching.find(looking_for) != StringPiece::npos;

	348 }

	349

	350 // ----- other methods -----

	351

	352 // Clear operator

	353 void UnicodeText::clear() {

	354 repr_.clear();

	355 }

	356

	357 // Destructor

	358 UnicodeText::~UnicodeText() {}

	359

	360

	361 void UnicodeText::push_back(char32 c) {

	362 if (UniLib::IsValidCodepoint(c)) {

	363 char buf[UTFmax];

	364 int len = runetochar(buf, &c);

	365 if (UniLib::IsInterchangeValid(buf, len)) {

	366 repr_.append(buf, len);

	367 } else {

	368 cerr << "Unicode value 0x" << hex << c

	369 << " is not valid for interchange" << endl;

	370 repr_.append(" ", 1);

	371 }

	372 } else {

	373 cerr << "Illegal Unicode value: 0x" << hex << c << endl;

	374 repr_.append(" ", 1);

	375 }

	376 }

	377

	378 int UnicodeText::size() const {

	379 return CodepointCount(repr_.data_, repr_.size_);

	380 }

	381

	382 bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {

	383 if (&lhs == &rhs) return true;

	384 if (lhs.repr_.size_ != rhs.repr_.size_) return false;

	385 return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;

	386 }

	387

	388 string UnicodeText::DebugString() const {

	389 stringstream ss;

	390

	391 ss << "{UnicodeText " << hex << this << dec << " chars="

	392 << size() << " repr=" << repr_.DebugString() << "}";

	393 #if 0

	394 return StringPrintf("{UnicodeText %p chars=%d repr=%s}",

	395 this,

	396 size(),

	397 repr_.DebugString().c_str());

	398 #endif

	399 string result;

	400 ss >> result;

	401

	402 return result;

	403 }

	404

	405

	406 // ***************** UnicodeText::const_iterator *******************

	407

	408 // The implementation of const_iterator would be nicer if it

	409 // inherited from boost::iterator_facade

	410 // (http://boost.org/libs/iterator/doc/iterator_facade.html).

	411

	412 UnicodeText::const_iterator::const_iterator() : it_(0) {}

	413

	414 UnicodeText::const_iterator::const_iterator(const const_iterator& other)

	415 : it_(other.it_) {

	416 }

	417

	418 UnicodeText::const_iterator&

	419 UnicodeText::const_iterator::operator=(const const_iterator& other) {

	420 if (&other != this)

	421 it_ = other.it_;

	422 return *this;

	423 }

	424

	425 UnicodeText::const_iterator UnicodeText::begin() const {

	426 return const_iterator(repr_.data_);

	427 }

	428

	429 UnicodeText::const_iterator UnicodeText::end() const {

	430 return const_iterator(repr_.data_ + repr_.size_);

	431 }

	432

	433 bool operator<(const UnicodeText::const_iterator& lhs,

	434 const UnicodeText::const_iterator& rhs) {

	435 return lhs.it_ < rhs.it_;

	436 }

	437

	438 char32 UnicodeText::const_iterator::operator*() const {

	439 // (We could call chartorune here, but that does some

	440 // error-checking, and we're guaranteed that our data is valid

	441 // UTF-8. Also, we expect this routine to be called very often. So

	442 // for speed, we do the calculation ourselves.)

	443

	444 // Convert from UTF-8

	445 int byte1 = it_[0];

	446 if (byte1 < 0x80)

	447 return byte1;

	448

	449 int byte2 = it_[1];

	450 if (byte1 < 0xE0)

	451 return ((byte1 & 0x1F) << 6)

	452 \| (byte2 & 0x3F);

	453

	454 int byte3 = it_[2];

	455 if (byte1 < 0xF0)

	456 return ((byte1 & 0x0F) << 12)

	457 \| ((byte2 & 0x3F) << 6)

	458 \| (byte3 & 0x3F);

	459

	460 int byte4 = it_[3];

	461 return ((byte1 & 0x07) << 18)

	462 \| ((byte2 & 0x3F) << 12)

	463 \| ((byte3 & 0x3F) << 6)

	464 \| (byte4 & 0x3F);

	465 }

	466

	467 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {

	468 it_ += UniLib::OneCharLen(it_);

	469 return *this;

	470 }

	471

	472 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {

	473 while (UniLib::IsTrailByte(*--it_));

	474 return *this;

	475 }

	476

	477 int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {

	478 utf8_output[0] = it_[0];

	479 if (static_cast<unsigned char>(it_[0]) < 0x80)

	480 return 1;

	481

	482 utf8_output[1] = it_[1];

	483 if (static_cast<unsigned char>(it_[0]) < 0xE0)

	484 return 2;

	485

	486 utf8_output[2] = it_[2];

	487 if (static_cast<unsigned char>(it_[0]) < 0xF0)

	488 return 3;

	489

	490 utf8_output[3] = it_[3];

	491 return 4;

	492 }

	493

	494

	495 UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {

	496 assert(p != NULL);

	497 const char* start = utf8_data();

	498 int len = utf8_length();

	499 const char* end = start + len;

	500 assert(p >= start);

	501 assert(p <= end);

	502 assert(p == end \|\| !UniLib::IsTrailByte(*p));

	503 return const_iterator(p);

	504 }

	505

	506 string UnicodeText::const_iterator::DebugString() const {

	507 stringstream ss;

	508

	509 ss << "{iter " << hex << it_ << "}";

	510 string result;

	511 ss >> result;

	512

	513 return result;

	514 }

	515

OLD	NEW