Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1309)

Side by Side Diff: third_party/libphonenumber/cpp/src/utf/unicodetext.cc

Issue 6930013: Re-committing http://codereview.chromium.org/6803005/ after fixing multi-dll build: (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: Created 9 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
Property Changes:
Added: svn:eol-style
+ LF
OLDNEW
(Empty)
1 // Copyright (C) 2006 Google Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 // Author: Jim Meehan
16
17 #include <iostream>
18 #include <sstream>
19 #include <cassert>
20
21 #include "utf/unicodetext.h"
22 //#include "base/logging.h"
23 #include "utf/stringpiece.h"
24 //#include "utf/stringprintf.h"
25 #include "utf/utf.h"
26 #include "utf/unilib.h"
27
28 using std::stringstream;
29 using std::max;
30 using std::hex;
31 using std::dec;
32 using std::cerr;
33 using std::endl;
34
// Counts the code points encoded in the UTF-8 byte range [start, end).
// Every byte that is not a continuation ("trail") byte -- i.e. whose top two
// bits are not 10 -- starts a new code point, so counting those is enough.
static int CodepointDistance(const char* start, const char* end) {
  int lead_bytes = 0;
  const char* cursor = start;
  while (cursor < end) {
    const unsigned char byte = static_cast<unsigned char>(*cursor);
    if ((byte & 0xC0) != 0x80) {
      ++lead_bytes;
    }
    ++cursor;
  }
  return lead_bytes;
}
43
44 static int CodepointCount(const char* utf8, int len) {
45 return CodepointDistance(utf8, utf8 + len);
46 }
47
48 UnicodeText::const_iterator::difference_type
49 distance(const UnicodeText::const_iterator& first,
50 const UnicodeText::const_iterator& last) {
51 return CodepointDistance(first.it_, last.it_);
52 }
53
54 // ---------- Utility ----------
55
56 static int ConvertToInterchangeValid(char* start, int len) {
57 // This routine is called only when we've discovered that a UTF-8 buffer
58 // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8
59 // was not interchange valid. This indicates a bug in the caller, and
60 // a LOG(WARNING) is done in that case.
61 // This is similar to CoerceToInterchangeValid, but it replaces each
62 // structurally valid byte with a space, and each non-interchange
63 // character with a space, even when that character requires more
64 // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is
65 // structurally valid UTF8, but U+FDD0 is not an interchange-valid
66 // code point. The result should contain one space, not three.
67 //
68 // Since the conversion never needs to write more data than it
69 // reads, it is safe to change the buffer in place. It returns the
70 // number of bytes written.
71 char* const in = start;
72 char* out = start;
73 char* const end = start + len;
74 while (start < end) {
75 int good = UniLib::SpanInterchangeValid(start, end - start);
76 if (good > 0) {
77 if (out != start) {
78 memmove(out, start, good);
79 }
80 out += good;
81 start += good;
82 if (start == end) {
83 break;
84 }
85 }
86 // Is the current string invalid UTF8 or just non-interchange UTF8?
87 char32 rune;
88 int n;
89 if (isvalidcharntorune(start, end - start, &rune, &n)) {
90 // structurally valid UTF8, but not interchange valid
91 start += n; // Skip over the whole character.
92 } else { // bad UTF8
93 start += 1; // Skip over just one byte
94 }
95 *out++ = ' ';
96 }
97 return out - in;
98 }
99
100
101 // *************** Data representation **********
102
103 // Note: the copy constructor is undefined.
104
105 // After reserve(), resize(), or clear(), we're an owner, not an alias.
106
107 void UnicodeText::Repr::reserve(int new_capacity) {
108 // If there's already enough capacity, and we're an owner, do nothing.
109 if (capacity_ >= new_capacity && ours_) return;
110
111 // Otherwise, allocate a new buffer.
112 capacity_ = max(new_capacity, (3 * capacity_) / 2 + 20);
113 char* new_data = new char[capacity_];
114
115 // If there is an old buffer, copy it into the new buffer.
116 if (data_) {
117 memcpy(new_data, data_, size_);
118 if (ours_) delete[] data_; // If we owned the old buffer, free it.
119 }
120 data_ = new_data;
121 ours_ = true; // We own the new buffer.
122 // size_ is unchanged.
123 }
124
125 void UnicodeText::Repr::resize(int new_size) {
126 if (new_size == 0) {
127 clear();
128 } else {
129 if (!ours_ || new_size > capacity_) reserve(new_size);
130 // Clear the memory in the expanded part.
131 if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
132 size_ = new_size;
133 ours_ = true;
134 }
135 }
136
137 // This implementation of clear() deallocates the buffer if we're an owner.
138 // That's not strictly necessary; we could just set size_ to 0.
139 void UnicodeText::Repr::clear() {
140 if (ours_) delete[] data_;
141 data_ = NULL;
142 size_ = capacity_ = 0;
143 ours_ = true;
144 }
145
146 void UnicodeText::Repr::Copy(const char* data, int size) {
147 resize(size);
148 memcpy(data_, data, size);
149 }
150
151 void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {
152 if (data == data_) return; // We already own this memory. (Weird case.)
153 if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.
154 data_ = data;
155 size_ = size;
156 capacity_ = capacity;
157 ours_ = true;
158 }
159
160 void UnicodeText::Repr::PointTo(const char* data, int size) {
161 if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.
162 data_ = const_cast<char*>(data);
163 size_ = size;
164 capacity_ = size;
165 ours_ = false;
166 }
167
168 void UnicodeText::Repr::append(const char* bytes, int byte_length) {
169 reserve(size_ + byte_length);
170 memcpy(data_ + size_, bytes, byte_length);
171 size_ += byte_length;
172 }
173
174 string UnicodeText::Repr::DebugString() const {
175 stringstream ss;
176
177 ss << "{Repr " << hex << this << " data=" << data_ << " size=" << dec
178 << size_ << " capacity=" << capacity_ << " "
179 << (ours_ ? "Owned" : "Alias") << "}";
180
181 string result;
182 ss >> result;
183
184 return result;
185 }
186
187
188
189 // *************** UnicodeText ******************
190
191 // ----- Constructors -----
192
193 // Default constructor
194 UnicodeText::UnicodeText() {
195 }
196
197 // Copy constructor
198 UnicodeText::UnicodeText(const UnicodeText& src) {
199 Copy(src);
200 }
201
202 // Substring constructor
203 UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,
204 const UnicodeText::const_iterator& last) {
205 assert(first <= last && "Incompatible iterators");
206 repr_.append(first.it_, last.it_ - first.it_);
207 }
208
209 string UnicodeText::UTF8Substring(const const_iterator& first,
210 const const_iterator& last) {
211 assert(first <= last && "Incompatible iterators");
212 return string(first.it_, last.it_ - first.it_);
213 }
214
215
216 // ----- Copy -----
217
218 UnicodeText& UnicodeText::operator=(const UnicodeText& src) {
219 if (this != &src) {
220 Copy(src);
221 }
222 return *this;
223 }
224
225 UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
226 repr_.Copy(src.repr_.data_, src.repr_.size_);
227 return *this;
228 }
229
230 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
231 repr_.Copy(buffer, byte_length);
232 if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
233 cerr << "UTF-8 buffer is not interchange-valid." << endl;
234 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
235 }
236 return *this;
237 }
238
239 UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,
240 int byte_length) {
241 repr_.Copy(buffer, byte_length);
242 return *this;
243 }
244
245 // ----- TakeOwnershipOf -----
246
247 UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
248 int byte_length,
249 int byte_capacity) {
250 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
251 if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
252 cerr << "UTF-8 buffer is not interchange-valid." << endl;
253 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
254 }
255 return *this;
256 }
257
258 UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
259 int byte_length,
260 int byte_capacity) {
261 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
262 return *this;
263 }
264
265 // ----- PointTo -----
266
267 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
268 if (UniLib:: IsInterchangeValid(buffer, byte_length)) {
269 repr_.PointTo(buffer, byte_length);
270 } else {
271 cerr << "UTF-8 buffer is not interchange-valid." << endl;
272 repr_.Copy(buffer, byte_length);
273 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
274 }
275 return *this;
276 }
277
278 UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,
279 int byte_length) {
280 repr_.PointTo(buffer, byte_length);
281 return *this;
282 }
283
284 UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {
285 repr_.PointTo(src.repr_.data_, src.repr_.size_);
286 return *this;
287 }
288
289 UnicodeText& UnicodeText::PointTo(const const_iterator &first,
290 const const_iterator &last) {
291 assert(first <= last && " Incompatible iterators");
292 repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());
293 return *this;
294 }
295
296 // ----- Append -----
297
298 UnicodeText& UnicodeText::append(const UnicodeText& u) {
299 repr_.append(u.repr_.data_, u.repr_.size_);
300 return *this;
301 }
302
303 UnicodeText& UnicodeText::append(const const_iterator& first,
304 const const_iterator& last) {
305 assert(first <= last && "Incompatible iterators");
306 repr_.append(first.it_, last.it_ - first.it_);
307 return *this;
308 }
309
310 UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {
311 repr_.append(utf8, len);
312 return *this;
313 }
314
315 // ----- substring searching -----
316
317 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,
318 const_iterator start_pos) const {
319 assert(start_pos.utf8_data() >= utf8_data());
320 assert(start_pos.utf8_data() <= utf8_data() + utf8_length());
321 return UnsafeFind(look, start_pos);
322 }
323
324 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {
325 return UnsafeFind(look, begin());
326 }
327
328 UnicodeText::const_iterator UnicodeText::UnsafeFind(
329 const UnicodeText& look, const_iterator start_pos) const {
330 // Due to the magic of the UTF8 encoding, searching for a sequence of
331 // letters is equivalent to substring search.
332 StringPiece searching(utf8_data(), utf8_length());
333 StringPiece look_piece(look.utf8_data(), look.utf8_length());
334 StringPiece::size_type found =
335 searching.find(look_piece, start_pos.utf8_data() - utf8_data());
336 if (found == StringPiece::npos) return end();
337 return const_iterator(utf8_data() + found);
338 }
339
340 bool UnicodeText::HasReplacementChar() const {
341 // Equivalent to:
342 // UnicodeText replacement_char;
343 // replacement_char.push_back(0xFFFD);
344 // return find(replacement_char) != end();
345 StringPiece searching(utf8_data(), utf8_length());
346 StringPiece looking_for("\xEF\xBF\xBD", 3);
347 return searching.find(looking_for) != StringPiece::npos;
348 }
349
350 // ----- other methods -----
351
352 // Clear operator
353 void UnicodeText::clear() {
354 repr_.clear();
355 }
356
357 // Destructor
358 UnicodeText::~UnicodeText() {}
359
360
361 void UnicodeText::push_back(char32 c) {
362 if (UniLib::IsValidCodepoint(c)) {
363 char buf[UTFmax];
364 int len = runetochar(buf, &c);
365 if (UniLib::IsInterchangeValid(buf, len)) {
366 repr_.append(buf, len);
367 } else {
368 cerr << "Unicode value 0x" << hex << c
369 << " is not valid for interchange" << endl;
370 repr_.append(" ", 1);
371 }
372 } else {
373 cerr << "Illegal Unicode value: 0x" << hex << c << endl;
374 repr_.append(" ", 1);
375 }
376 }
377
378 int UnicodeText::size() const {
379 return CodepointCount(repr_.data_, repr_.size_);
380 }
381
382 bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {
383 if (&lhs == &rhs) return true;
384 if (lhs.repr_.size_ != rhs.repr_.size_) return false;
385 return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
386 }
387
388 string UnicodeText::DebugString() const {
389 stringstream ss;
390
391 ss << "{UnicodeText " << hex << this << dec << " chars="
392 << size() << " repr=" << repr_.DebugString() << "}";
393 #if 0
394 return StringPrintf("{UnicodeText %p chars=%d repr=%s}",
395 this,
396 size(),
397 repr_.DebugString().c_str());
398 #endif
399 string result;
400 ss >> result;
401
402 return result;
403 }
404
405
406 // ******************* UnicodeText::const_iterator *********************
407
408 // The implementation of const_iterator would be nicer if it
409 // inherited from boost::iterator_facade
410 // (http://boost.org/libs/iterator/doc/iterator_facade.html).
411
412 UnicodeText::const_iterator::const_iterator() : it_(0) {}
413
414 UnicodeText::const_iterator::const_iterator(const const_iterator& other)
415 : it_(other.it_) {
416 }
417
418 UnicodeText::const_iterator&
419 UnicodeText::const_iterator::operator=(const const_iterator& other) {
420 if (&other != this)
421 it_ = other.it_;
422 return *this;
423 }
424
425 UnicodeText::const_iterator UnicodeText::begin() const {
426 return const_iterator(repr_.data_);
427 }
428
429 UnicodeText::const_iterator UnicodeText::end() const {
430 return const_iterator(repr_.data_ + repr_.size_);
431 }
432
433 bool operator<(const UnicodeText::const_iterator& lhs,
434 const UnicodeText::const_iterator& rhs) {
435 return lhs.it_ < rhs.it_;
436 }
437
438 char32 UnicodeText::const_iterator::operator*() const {
439 // (We could call chartorune here, but that does some
440 // error-checking, and we're guaranteed that our data is valid
441 // UTF-8. Also, we expect this routine to be called very often. So
442 // for speed, we do the calculation ourselves.)
443
444 // Convert from UTF-8
445 int byte1 = it_[0];
446 if (byte1 < 0x80)
447 return byte1;
448
449 int byte2 = it_[1];
450 if (byte1 < 0xE0)
451 return ((byte1 & 0x1F) << 6)
452 | (byte2 & 0x3F);
453
454 int byte3 = it_[2];
455 if (byte1 < 0xF0)
456 return ((byte1 & 0x0F) << 12)
457 | ((byte2 & 0x3F) << 6)
458 | (byte3 & 0x3F);
459
460 int byte4 = it_[3];
461 return ((byte1 & 0x07) << 18)
462 | ((byte2 & 0x3F) << 12)
463 | ((byte3 & 0x3F) << 6)
464 | (byte4 & 0x3F);
465 }
466
467 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
468 it_ += UniLib::OneCharLen(it_);
469 return *this;
470 }
471
472 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
473 while (UniLib::IsTrailByte(*--it_));
474 return *this;
475 }
476
477 int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {
478 utf8_output[0] = it_[0];
479 if (static_cast<unsigned char>(it_[0]) < 0x80)
480 return 1;
481
482 utf8_output[1] = it_[1];
483 if (static_cast<unsigned char>(it_[0]) < 0xE0)
484 return 2;
485
486 utf8_output[2] = it_[2];
487 if (static_cast<unsigned char>(it_[0]) < 0xF0)
488 return 3;
489
490 utf8_output[3] = it_[3];
491 return 4;
492 }
493
494
495 UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {
496 assert(p != NULL);
497 const char* start = utf8_data();
498 int len = utf8_length();
499 const char* end = start + len;
500 assert(p >= start);
501 assert(p <= end);
502 assert(p == end || !UniLib::IsTrailByte(*p));
503 return const_iterator(p);
504 }
505
506 string UnicodeText::const_iterator::DebugString() const {
507 stringstream ss;
508
509 ss << "{iter " << hex << it_ << "}";
510 string result;
511 ss >> result;
512
513 return result;
514 }
515
OLDNEW
« no previous file with comments | « third_party/libphonenumber/cpp/src/utf/unicodetext.h ('k') | third_party/libphonenumber/cpp/src/utf/unilib.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698