OLD | NEW |
| (Empty) |
1 // Copyright (C) 2006 Google Inc. | |
2 // | |
3 // Licensed under the Apache License, Version 2.0 (the "License"); | |
4 // you may not use this file except in compliance with the License. | |
5 // You may obtain a copy of the License at | |
6 // | |
7 // http://www.apache.org/licenses/LICENSE-2.0 | |
8 // | |
9 // Unless required by applicable law or agreed to in writing, software | |
10 // distributed under the License is distributed on an "AS IS" BASIS, | |
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
12 // See the License for the specific language governing permissions and | |
13 // limitations under the License. | |
14 | |
15 // Author: Jim Meehan | |
16 | |
17 #include <iostream> | |
18 #include <sstream> | |
19 #include <cassert> | |
20 | |
21 #include "utf/unicodetext.h" | |
22 //#include "base/logging.h" | |
23 #include "utf/stringpiece.h" | |
24 //#include "utf/stringprintf.h" | |
25 #include "utf/utf.h" | |
26 #include "utf/unilib.h" | |
27 | |
28 using std::stringstream; | |
29 using std::max; | |
30 using std::hex; | |
31 using std::dec; | |
32 using std::cerr; | |
33 using std::endl; | |
34 | |
// Returns the number of code points encoded in the UTF-8 bytes [start, end).
// Every byte that is not a continuation (trail) byte, i.e. whose top two bits
// are not binary 10, begins a code point and is counted once.
static int CodepointDistance(const char* start, const char* end) {
  int count = 0;
  const char* cursor = start;
  while (cursor < end) {
    const unsigned char byte = static_cast<unsigned char>(*cursor);
    if ((byte & 0xC0) != 0x80) {
      ++count;
    }
    ++cursor;
  }
  return count;
}
43 | |
44 static int CodepointCount(const char* utf8, int len) { | |
45 return CodepointDistance(utf8, utf8 + len); | |
46 } | |
47 | |
48 UnicodeText::const_iterator::difference_type | |
49 distance(const UnicodeText::const_iterator& first, | |
50 const UnicodeText::const_iterator& last) { | |
51 return CodepointDistance(first.it_, last.it_); | |
52 } | |
53 | |
54 // ---------- Utility ---------- | |
55 | |
56 static int ConvertToInterchangeValid(char* start, int len) { | |
57 // This routine is called only when we've discovered that a UTF-8 buffer | |
58 // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8 | |
59 // was not interchange valid. This indicates a bug in the caller, and | |
60 // a LOG(WARNING) is done in that case. | |
61 // This is similar to CoerceToInterchangeValid, but it replaces each | |
62 // structurally valid byte with a space, and each non-interchange | |
63 // character with a space, even when that character requires more | |
64 // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is | |
65 // structurally valid UTF8, but U+FDD0 is not an interchange-valid | |
66 // code point. The result should contain one space, not three. | |
67 // | |
68 // Since the conversion never needs to write more data than it | |
69 // reads, it is safe to change the buffer in place. It returns the | |
70 // number of bytes written. | |
71 char* const in = start; | |
72 char* out = start; | |
73 char* const end = start + len; | |
74 while (start < end) { | |
75 int good = UniLib::SpanInterchangeValid(start, end - start); | |
76 if (good > 0) { | |
77 if (out != start) { | |
78 memmove(out, start, good); | |
79 } | |
80 out += good; | |
81 start += good; | |
82 if (start == end) { | |
83 break; | |
84 } | |
85 } | |
86 // Is the current string invalid UTF8 or just non-interchange UTF8? | |
87 char32 rune; | |
88 int n; | |
89 if (isvalidcharntorune(start, end - start, &rune, &n)) { | |
90 // structurally valid UTF8, but not interchange valid | |
91 start += n; // Skip over the whole character. | |
92 } else { // bad UTF8 | |
93 start += 1; // Skip over just one byte | |
94 } | |
95 *out++ = ' '; | |
96 } | |
97 return out - in; | |
98 } | |
99 | |
100 | |
101 // *************** Data representation ********** | |
102 | |
103 // Note: the copy constructor is undefined. | |
104 | |
105 // After reserve(), resize(), or clear(), we're an owner, not an alias. | |
106 | |
107 void UnicodeText::Repr::reserve(int new_capacity) { | |
108 // If there's already enough capacity, and we're an owner, do nothing. | |
109 if (capacity_ >= new_capacity && ours_) return; | |
110 | |
111 // Otherwise, allocate a new buffer. | |
112 capacity_ = max(new_capacity, (3 * capacity_) / 2 + 20); | |
113 char* new_data = new char[capacity_]; | |
114 | |
115 // If there is an old buffer, copy it into the new buffer. | |
116 if (data_) { | |
117 memcpy(new_data, data_, size_); | |
118 if (ours_) delete[] data_; // If we owned the old buffer, free it. | |
119 } | |
120 data_ = new_data; | |
121 ours_ = true; // We own the new buffer. | |
122 // size_ is unchanged. | |
123 } | |
124 | |
125 void UnicodeText::Repr::resize(int new_size) { | |
126 if (new_size == 0) { | |
127 clear(); | |
128 } else { | |
129 if (!ours_ || new_size > capacity_) reserve(new_size); | |
130 // Clear the memory in the expanded part. | |
131 if (size_ < new_size) memset(data_ + size_, 0, new_size - size_); | |
132 size_ = new_size; | |
133 ours_ = true; | |
134 } | |
135 } | |
136 | |
137 // This implementation of clear() deallocates the buffer if we're an owner. | |
138 // That's not strictly necessary; we could just set size_ to 0. | |
139 void UnicodeText::Repr::clear() { | |
140 if (ours_) delete[] data_; | |
141 data_ = NULL; | |
142 size_ = capacity_ = 0; | |
143 ours_ = true; | |
144 } | |
145 | |
146 void UnicodeText::Repr::Copy(const char* data, int size) { | |
147 resize(size); | |
148 memcpy(data_, data, size); | |
149 } | |
150 | |
151 void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) { | |
152 if (data == data_) return; // We already own this memory. (Weird case.) | |
153 if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. | |
154 data_ = data; | |
155 size_ = size; | |
156 capacity_ = capacity; | |
157 ours_ = true; | |
158 } | |
159 | |
160 void UnicodeText::Repr::PointTo(const char* data, int size) { | |
161 if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. | |
162 data_ = const_cast<char*>(data); | |
163 size_ = size; | |
164 capacity_ = size; | |
165 ours_ = false; | |
166 } | |
167 | |
168 void UnicodeText::Repr::append(const char* bytes, int byte_length) { | |
169 reserve(size_ + byte_length); | |
170 memcpy(data_ + size_, bytes, byte_length); | |
171 size_ += byte_length; | |
172 } | |
173 | |
174 string UnicodeText::Repr::DebugString() const { | |
175 stringstream ss; | |
176 | |
177 ss << "{Repr " << hex << this << " data=" << data_ << " size=" << dec | |
178 << size_ << " capacity=" << capacity_ << " " | |
179 << (ours_ ? "Owned" : "Alias") << "}"; | |
180 | |
181 string result; | |
182 ss >> result; | |
183 | |
184 return result; | |
185 } | |
186 | |
187 | |
188 | |
189 // *************** UnicodeText ****************** | |
190 | |
191 // ----- Constructors ----- | |
192 | |
193 // Default constructor | |
194 UnicodeText::UnicodeText() { | |
195 } | |
196 | |
197 // Copy constructor | |
198 UnicodeText::UnicodeText(const UnicodeText& src) { | |
199 Copy(src); | |
200 } | |
201 | |
202 // Substring constructor | |
203 UnicodeText::UnicodeText(const UnicodeText::const_iterator& first, | |
204 const UnicodeText::const_iterator& last) { | |
205 assert(first <= last && "Incompatible iterators"); | |
206 repr_.append(first.it_, last.it_ - first.it_); | |
207 } | |
208 | |
209 string UnicodeText::UTF8Substring(const const_iterator& first, | |
210 const const_iterator& last) { | |
211 assert(first <= last && "Incompatible iterators"); | |
212 return string(first.it_, last.it_ - first.it_); | |
213 } | |
214 | |
215 | |
216 // ----- Copy ----- | |
217 | |
218 UnicodeText& UnicodeText::operator=(const UnicodeText& src) { | |
219 if (this != &src) { | |
220 Copy(src); | |
221 } | |
222 return *this; | |
223 } | |
224 | |
225 UnicodeText& UnicodeText::Copy(const UnicodeText& src) { | |
226 repr_.Copy(src.repr_.data_, src.repr_.size_); | |
227 return *this; | |
228 } | |
229 | |
230 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) { | |
231 repr_.Copy(buffer, byte_length); | |
232 if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { | |
233 cerr << "UTF-8 buffer is not interchange-valid." << endl; | |
234 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); | |
235 } | |
236 return *this; | |
237 } | |
238 | |
239 UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer, | |
240 int byte_length) { | |
241 repr_.Copy(buffer, byte_length); | |
242 return *this; | |
243 } | |
244 | |
245 // ----- TakeOwnershipOf ----- | |
246 | |
247 UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer, | |
248 int byte_length, | |
249 int byte_capacity) { | |
250 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); | |
251 if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { | |
252 cerr << "UTF-8 buffer is not interchange-valid." << endl; | |
253 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); | |
254 } | |
255 return *this; | |
256 } | |
257 | |
258 UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer, | |
259 int byte_length, | |
260 int byte_capacity) { | |
261 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); | |
262 return *this; | |
263 } | |
264 | |
265 // ----- PointTo ----- | |
266 | |
267 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) { | |
268 if (UniLib:: IsInterchangeValid(buffer, byte_length)) { | |
269 repr_.PointTo(buffer, byte_length); | |
270 } else { | |
271 cerr << "UTF-8 buffer is not interchange-valid." << endl; | |
272 repr_.Copy(buffer, byte_length); | |
273 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); | |
274 } | |
275 return *this; | |
276 } | |
277 | |
278 UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer, | |
279 int byte_length) { | |
280 repr_.PointTo(buffer, byte_length); | |
281 return *this; | |
282 } | |
283 | |
284 UnicodeText& UnicodeText::PointTo(const UnicodeText& src) { | |
285 repr_.PointTo(src.repr_.data_, src.repr_.size_); | |
286 return *this; | |
287 } | |
288 | |
289 UnicodeText& UnicodeText::PointTo(const const_iterator &first, | |
290 const const_iterator &last) { | |
291 assert(first <= last && " Incompatible iterators"); | |
292 repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data()); | |
293 return *this; | |
294 } | |
295 | |
296 // ----- Append ----- | |
297 | |
298 UnicodeText& UnicodeText::append(const UnicodeText& u) { | |
299 repr_.append(u.repr_.data_, u.repr_.size_); | |
300 return *this; | |
301 } | |
302 | |
303 UnicodeText& UnicodeText::append(const const_iterator& first, | |
304 const const_iterator& last) { | |
305 assert(first <= last && "Incompatible iterators"); | |
306 repr_.append(first.it_, last.it_ - first.it_); | |
307 return *this; | |
308 } | |
309 | |
310 UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) { | |
311 repr_.append(utf8, len); | |
312 return *this; | |
313 } | |
314 | |
315 // ----- substring searching ----- | |
316 | |
317 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look, | |
318 const_iterator start_pos) const { | |
319 assert(start_pos.utf8_data() >= utf8_data()); | |
320 assert(start_pos.utf8_data() <= utf8_data() + utf8_length()); | |
321 return UnsafeFind(look, start_pos); | |
322 } | |
323 | |
324 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const { | |
325 return UnsafeFind(look, begin()); | |
326 } | |
327 | |
328 UnicodeText::const_iterator UnicodeText::UnsafeFind( | |
329 const UnicodeText& look, const_iterator start_pos) const { | |
330 // Due to the magic of the UTF8 encoding, searching for a sequence of | |
331 // letters is equivalent to substring search. | |
332 StringPiece searching(utf8_data(), utf8_length()); | |
333 StringPiece look_piece(look.utf8_data(), look.utf8_length()); | |
334 StringPiece::size_type found = | |
335 searching.find(look_piece, start_pos.utf8_data() - utf8_data()); | |
336 if (found == StringPiece::npos) return end(); | |
337 return const_iterator(utf8_data() + found); | |
338 } | |
339 | |
340 bool UnicodeText::HasReplacementChar() const { | |
341 // Equivalent to: | |
342 // UnicodeText replacement_char; | |
343 // replacement_char.push_back(0xFFFD); | |
344 // return find(replacement_char) != end(); | |
345 StringPiece searching(utf8_data(), utf8_length()); | |
346 StringPiece looking_for("\xEF\xBF\xBD", 3); | |
347 return searching.find(looking_for) != StringPiece::npos; | |
348 } | |
349 | |
350 // ----- other methods ----- | |
351 | |
352 // Clear operator | |
353 void UnicodeText::clear() { | |
354 repr_.clear(); | |
355 } | |
356 | |
357 // Destructor | |
358 UnicodeText::~UnicodeText() {} | |
359 | |
360 | |
361 void UnicodeText::push_back(char32 c) { | |
362 if (UniLib::IsValidCodepoint(c)) { | |
363 char buf[UTFmax]; | |
364 int len = runetochar(buf, &c); | |
365 if (UniLib::IsInterchangeValid(buf, len)) { | |
366 repr_.append(buf, len); | |
367 } else { | |
368 cerr << "Unicode value 0x" << hex << c | |
369 << " is not valid for interchange" << endl; | |
370 repr_.append(" ", 1); | |
371 } | |
372 } else { | |
373 cerr << "Illegal Unicode value: 0x" << hex << c << endl; | |
374 repr_.append(" ", 1); | |
375 } | |
376 } | |
377 | |
378 int UnicodeText::size() const { | |
379 return CodepointCount(repr_.data_, repr_.size_); | |
380 } | |
381 | |
382 bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) { | |
383 if (&lhs == &rhs) return true; | |
384 if (lhs.repr_.size_ != rhs.repr_.size_) return false; | |
385 return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0; | |
386 } | |
387 | |
388 string UnicodeText::DebugString() const { | |
389 stringstream ss; | |
390 | |
391 ss << "{UnicodeText " << hex << this << dec << " chars=" | |
392 << size() << " repr=" << repr_.DebugString() << "}"; | |
393 #if 0 | |
394 return StringPrintf("{UnicodeText %p chars=%d repr=%s}", | |
395 this, | |
396 size(), | |
397 repr_.DebugString().c_str()); | |
398 #endif | |
399 string result; | |
400 ss >> result; | |
401 | |
402 return result; | |
403 } | |
404 | |
405 | |
406 // ******************* UnicodeText::const_iterator ********************* | |
407 | |
408 // The implementation of const_iterator would be nicer if it | |
409 // inherited from boost::iterator_facade | |
410 // (http://boost.org/libs/iterator/doc/iterator_facade.html). | |
411 | |
412 UnicodeText::const_iterator::const_iterator() : it_(0) {} | |
413 | |
414 UnicodeText::const_iterator::const_iterator(const const_iterator& other) | |
415 : it_(other.it_) { | |
416 } | |
417 | |
418 UnicodeText::const_iterator& | |
419 UnicodeText::const_iterator::operator=(const const_iterator& other) { | |
420 if (&other != this) | |
421 it_ = other.it_; | |
422 return *this; | |
423 } | |
424 | |
425 UnicodeText::const_iterator UnicodeText::begin() const { | |
426 return const_iterator(repr_.data_); | |
427 } | |
428 | |
429 UnicodeText::const_iterator UnicodeText::end() const { | |
430 return const_iterator(repr_.data_ + repr_.size_); | |
431 } | |
432 | |
433 bool operator<(const UnicodeText::const_iterator& lhs, | |
434 const UnicodeText::const_iterator& rhs) { | |
435 return lhs.it_ < rhs.it_; | |
436 } | |
437 | |
438 char32 UnicodeText::const_iterator::operator*() const { | |
439 // (We could call chartorune here, but that does some | |
440 // error-checking, and we're guaranteed that our data is valid | |
441 // UTF-8. Also, we expect this routine to be called very often. So | |
442 // for speed, we do the calculation ourselves.) | |
443 | |
444 // Convert from UTF-8 | |
445 int byte1 = it_[0]; | |
446 if (byte1 < 0x80) | |
447 return byte1; | |
448 | |
449 int byte2 = it_[1]; | |
450 if (byte1 < 0xE0) | |
451 return ((byte1 & 0x1F) << 6) | |
452 | (byte2 & 0x3F); | |
453 | |
454 int byte3 = it_[2]; | |
455 if (byte1 < 0xF0) | |
456 return ((byte1 & 0x0F) << 12) | |
457 | ((byte2 & 0x3F) << 6) | |
458 | (byte3 & 0x3F); | |
459 | |
460 int byte4 = it_[3]; | |
461 return ((byte1 & 0x07) << 18) | |
462 | ((byte2 & 0x3F) << 12) | |
463 | ((byte3 & 0x3F) << 6) | |
464 | (byte4 & 0x3F); | |
465 } | |
466 | |
467 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() { | |
468 it_ += UniLib::OneCharLen(it_); | |
469 return *this; | |
470 } | |
471 | |
472 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() { | |
473 while (UniLib::IsTrailByte(*--it_)); | |
474 return *this; | |
475 } | |
476 | |
477 int UnicodeText::const_iterator::get_utf8(char* utf8_output) const { | |
478 utf8_output[0] = it_[0]; | |
479 if (static_cast<unsigned char>(it_[0]) < 0x80) | |
480 return 1; | |
481 | |
482 utf8_output[1] = it_[1]; | |
483 if (static_cast<unsigned char>(it_[0]) < 0xE0) | |
484 return 2; | |
485 | |
486 utf8_output[2] = it_[2]; | |
487 if (static_cast<unsigned char>(it_[0]) < 0xF0) | |
488 return 3; | |
489 | |
490 utf8_output[3] = it_[3]; | |
491 return 4; | |
492 } | |
493 | |
494 | |
495 UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const { | |
496 assert(p != NULL); | |
497 const char* start = utf8_data(); | |
498 int len = utf8_length(); | |
499 const char* end = start + len; | |
500 assert(p >= start); | |
501 assert(p <= end); | |
502 assert(p == end || !UniLib::IsTrailByte(*p)); | |
503 return const_iterator(p); | |
504 } | |
505 | |
506 string UnicodeText::const_iterator::DebugString() const { | |
507 stringstream ss; | |
508 | |
509 ss << "{iter " << hex << it_ << "}"; | |
510 string result; | |
511 ss >> result; | |
512 | |
513 return result; | |
514 } | |
515 | |
OLD | NEW |