third_party/libphonenumber/cpp/src/utf/unicodetext.cc - Issue 6920006: Revert 84000 - Autofill phone number enhancements and integration of Phone Number Util Library: p...

Side by Side Diff: third_party/libphonenumber/cpp/src/utf/unicodetext.cc

Issue 6920006: Revert 84000 - Autofill phone number enhancements and integration of Phone Number Util Library: p... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/

Patch Set: Created 9 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 // Copyright (C) 2006 Google Inc.

2 //

3 // Licensed under the Apache License, Version 2.0 (the "License");

4 // you may not use this file except in compliance with the License.

5 // You may obtain a copy of the License at

6 //

7 // http://www.apache.org/licenses/LICENSE-2.0

8 //

9 // Unless required by applicable law or agreed to in writing, software

10 // distributed under the License is distributed on an "AS IS" BASIS,

11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12 // See the License for the specific language governing permissions and

13 // limitations under the License.

14

15 // Author: Jim Meehan

16

17 #include <iostream>

18 #include <sstream>

19 #include <cassert>

20

21 #include "utf/unicodetext.h"

22 //#include "base/logging.h"

23 #include "utf/stringpiece.h"

24 //#include "utf/stringprintf.h"

25 #include "utf/utf.h"

26 #include "utf/unilib.h"

27

28 using std::stringstream;

29 using std::max;

30 using std::hex;

31 using std::dec;

32 using std::cerr;

33 using std::endl;

34

// Returns the number of code points encoded in the UTF-8 byte range
// [start, end). Every byte that is not a UTF-8 trail byte (10xxxxxx) begins
// a new code point, so counting those bytes counts code points. An empty or
// reversed range yields 0.
static int CodepointDistance(const char* start, const char* end) {
  int n = 0;
  for (const char* p = start; p < end; ++p) {
    // Read as a signed char, trail bytes 0x80..0xBF are -128..-65, i.e.
    // strictly below -0x40; every ASCII or lead byte compares >= -0x40.
    // The byte itself must be inspected, so the pointer is dereferenced.
    n += (*reinterpret_cast<const signed char*>(p) >= -0x40);
  }
  return n;
}

43

44 static int CodepointCount(const char* utf8, int len) {

45 return CodepointDistance(utf8, utf8 + len);

46 }

47

48 UnicodeText::const_iterator::difference_type

49 distance(const UnicodeText::const_iterator& first,

50 const UnicodeText::const_iterator& last) {

51 return CodepointDistance(first.it_, last.it_);

52 }

53

54 // ---------- Utility ----------

55

56 static int ConvertToInterchangeValid(char* start, int len) {

57 // This routine is called only when we've discovered that a UTF-8 buffer

58 // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8

59 // was not interchange valid. This indicates a bug in the caller, and

60 // a LOG(WARNING) is done in that case.

61 // This is similar to CoerceToInterchangeValid, but it replaces each

62 // structurally valid byte with a space, and each non-interchange

63 // character with a space, even when that character requires more

64 // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is

65 // structurally valid UTF8, but U+FDD0 is not an interchange-valid

66 // code point. The result should contain one space, not three.

67 //

68 // Since the conversion never needs to write more data than it

69 // reads, it is safe to change the buffer in place. It returns the

70 // number of bytes written.

71 char* const in = start;

72 char* out = start;

73 char* const end = start + len;

74 while (start < end) {

75 int good = UniLib::SpanInterchangeValid(start, end - start);

76 if (good > 0) {

77 if (out != start) {

78 memmove(out, start, good);

79 }

80 out += good;

81 start += good;

82 if (start == end) {

83 break;

84 }

85 }

86 // Is the current string invalid UTF8 or just non-interchange UTF8?

87 char32 rune;

88 int n;

89 if (isvalidcharntorune(start, end - start, &rune, &n)) {

90 // structurally valid UTF8, but not interchange valid

91 start += n; // Skip over the whole character.

92 } else { // bad UTF8

93 start += 1; // Skip over just one byte

94 }

95 *out++ = ' ';

96 }

97 return out - in;

98 }

99

100

101 // ************* Data representation ********

102

103 // Note: the copy constructor is undefined.

104

105 // After reserve(), resize(), or clear(), we're an owner, not an alias.

106

107 void UnicodeText::Repr::reserve(int new_capacity) {

108 // If there's already enough capacity, and we're an owner, do nothing.

109 if (capacity_ >= new_capacity && ours_) return;

110

111 // Otherwise, allocate a new buffer.

112 capacity_ = max(new_capacity, (3 * capacity_) / 2 + 20);

113 char* new_data = new char[capacity_];

114

115 // If there is an old buffer, copy it into the new buffer.

116 if (data_) {

117 memcpy(new_data, data_, size_);

118 if (ours_) delete[] data_; // If we owned the old buffer, free it.

119 }

120 data_ = new_data;

121 ours_ = true; // We own the new buffer.

122 // size_ is unchanged.

123 }

124

125 void UnicodeText::Repr::resize(int new_size) {

126 if (new_size == 0) {

127 clear();

128 } else {

129 if (!ours_ \|\| new_size > capacity_) reserve(new_size);

130 // Clear the memory in the expanded part.

131 if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);

132 size_ = new_size;

133 ours_ = true;

134 }

135 }

136

137 // This implementation of clear() deallocates the buffer if we're an owner.

138 // That's not strictly necessary; we could just set size_ to 0.

139 void UnicodeText::Repr::clear() {

140 if (ours_) delete[] data_;

141 data_ = NULL;

142 size_ = capacity_ = 0;

143 ours_ = true;

144 }

145

146 void UnicodeText::Repr::Copy(const char* data, int size) {

147 resize(size);

148 memcpy(data_, data, size);

149 }

150

151 void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {

152 if (data == data_) return; // We already own this memory. (Weird case.)

153 if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.

154 data_ = data;

155 size_ = size;

156 capacity_ = capacity;

157 ours_ = true;

158 }

159

160 void UnicodeText::Repr::PointTo(const char* data, int size) {

161 if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.

162 data_ = const_cast<char*>(data);

163 size_ = size;

164 capacity_ = size;

165 ours_ = false;

166 }

167

168 void UnicodeText::Repr::append(const char* bytes, int byte_length) {

169 reserve(size_ + byte_length);

170 memcpy(data_ + size_, bytes, byte_length);

171 size_ += byte_length;

172 }

173

174 string UnicodeText::Repr::DebugString() const {

175 stringstream ss;

176

177 ss << "{Repr " << hex << this << " data=" << data_ << " size=" << dec

178 << size_ << " capacity=" << capacity_ << " "

179 << (ours_ ? "Owned" : "Alias") << "}";

180

181 string result;

182 ss >> result;

183

184 return result;

185 }

186

187

188

189 // ************* UnicodeText ****************

190

191 // ----- Constructors -----

192

193 // Default constructor

194 UnicodeText::UnicodeText() {

195 }

196

197 // Copy constructor

198 UnicodeText::UnicodeText(const UnicodeText& src) {

199 Copy(src);

200 }

201

202 // Substring constructor

203 UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,

204 const UnicodeText::const_iterator& last) {

205 assert(first <= last && "Incompatible iterators");

206 repr_.append(first.it_, last.it_ - first.it_);

207 }

208

209 string UnicodeText::UTF8Substring(const const_iterator& first,

210 const const_iterator& last) {

211 assert(first <= last && "Incompatible iterators");

212 return string(first.it_, last.it_ - first.it_);

213 }

214

215

216 // ----- Copy -----

217

218 UnicodeText& UnicodeText::operator=(const UnicodeText& src) {

219 if (this != &src) {

220 Copy(src);

221 }

222 return *this;

223 }

224

225 UnicodeText& UnicodeText::Copy(const UnicodeText& src) {

226 repr_.Copy(src.repr_.data_, src.repr_.size_);

227 return *this;

228 }

229

230 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {

231 repr_.Copy(buffer, byte_length);

232 if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {

233 cerr << "UTF-8 buffer is not interchange-valid." << endl;

234 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);

235 }

236 return *this;

237 }

238

239 UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,

240 int byte_length) {

241 repr_.Copy(buffer, byte_length);

242 return *this;

243 }

244

245 // ----- TakeOwnershipOf -----

246

247 UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,

248 int byte_length,

249 int byte_capacity) {

250 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);

251 if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {

252 cerr << "UTF-8 buffer is not interchange-valid." << endl;

253 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);

254 }

255 return *this;

256 }

257

258 UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,

259 int byte_length,

260 int byte_capacity) {

261 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);

262 return *this;

263 }

264

265 // ----- PointTo -----

266

267 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {

268 if (UniLib:: IsInterchangeValid(buffer, byte_length)) {

269 repr_.PointTo(buffer, byte_length);

270 } else {

271 cerr << "UTF-8 buffer is not interchange-valid." << endl;

272 repr_.Copy(buffer, byte_length);

273 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);

274 }

275 return *this;

276 }

277

278 UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,

279 int byte_length) {

280 repr_.PointTo(buffer, byte_length);

281 return *this;

282 }

283

284 UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {

285 repr_.PointTo(src.repr_.data_, src.repr_.size_);

286 return *this;

287 }

288

289 UnicodeText& UnicodeText::PointTo(const const_iterator &first,

290 const const_iterator &last) {

291 assert(first <= last && " Incompatible iterators");

292 repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());

293 return *this;

294 }

295

296 // ----- Append -----

297

298 UnicodeText& UnicodeText::append(const UnicodeText& u) {

299 repr_.append(u.repr_.data_, u.repr_.size_);

300 return *this;

301 }

302

303 UnicodeText& UnicodeText::append(const const_iterator& first,

304 const const_iterator& last) {

305 assert(first <= last && "Incompatible iterators");

306 repr_.append(first.it_, last.it_ - first.it_);

307 return *this;

308 }

309

310 UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {

311 repr_.append(utf8, len);

312 return *this;

313 }

314

315 // ----- substring searching -----

316

317 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,

318 const_iterator start_pos) const {

319 assert(start_pos.utf8_data() >= utf8_data());

320 assert(start_pos.utf8_data() <= utf8_data() + utf8_length());

321 return UnsafeFind(look, start_pos);

322 }

323

324 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {

325 return UnsafeFind(look, begin());

326 }

327

328 UnicodeText::const_iterator UnicodeText::UnsafeFind(

329 const UnicodeText& look, const_iterator start_pos) const {

330 // Due to the magic of the UTF8 encoding, searching for a sequence of

331 // letters is equivalent to substring search.

332 StringPiece searching(utf8_data(), utf8_length());

333 StringPiece look_piece(look.utf8_data(), look.utf8_length());

334 StringPiece::size_type found =

335 searching.find(look_piece, start_pos.utf8_data() - utf8_data());

336 if (found == StringPiece::npos) return end();

337 return const_iterator(utf8_data() + found);

338 }

339

340 bool UnicodeText::HasReplacementChar() const {

341 // Equivalent to:

342 // UnicodeText replacement_char;

343 // replacement_char.push_back(0xFFFD);

344 // return find(replacement_char) != end();

345 StringPiece searching(utf8_data(), utf8_length());

346 StringPiece looking_for("\xEF\xBF\xBD", 3);

347 return searching.find(looking_for) != StringPiece::npos;

348 }

349

350 // ----- other methods -----

351

352 // Clear operator

353 void UnicodeText::clear() {

354 repr_.clear();

355 }

356

357 // Destructor

358 UnicodeText::~UnicodeText() {}

359

360

361 void UnicodeText::push_back(char32 c) {

362 if (UniLib::IsValidCodepoint(c)) {

363 char buf[UTFmax];

364 int len = runetochar(buf, &c);

365 if (UniLib::IsInterchangeValid(buf, len)) {

366 repr_.append(buf, len);

367 } else {

368 cerr << "Unicode value 0x" << hex << c

369 << " is not valid for interchange" << endl;

370 repr_.append(" ", 1);

371 }

372 } else {

373 cerr << "Illegal Unicode value: 0x" << hex << c << endl;

374 repr_.append(" ", 1);

375 }

376 }

377

378 int UnicodeText::size() const {

379 return CodepointCount(repr_.data_, repr_.size_);

380 }

381

382 bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {

383 if (&lhs == &rhs) return true;

384 if (lhs.repr_.size_ != rhs.repr_.size_) return false;

385 return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;

386 }

387

388 string UnicodeText::DebugString() const {

389 stringstream ss;

390

391 ss << "{UnicodeText " << hex << this << dec << " chars="

392 << size() << " repr=" << repr_.DebugString() << "}";

393 #if 0

394 return StringPrintf("{UnicodeText %p chars=%d repr=%s}",

395 this,

396 size(),

397 repr_.DebugString().c_str());

398 #endif

399 string result;

400 ss >> result;

401

402 return result;

403 }

404

405

406 // ***************** UnicodeText::const_iterator *******************

407

408 // The implementation of const_iterator would be nicer if it

409 // inherited from boost::iterator_facade

410 // (http://boost.org/libs/iterator/doc/iterator_facade.html).

411

412 UnicodeText::const_iterator::const_iterator() : it_(0) {}

413

414 UnicodeText::const_iterator::const_iterator(const const_iterator& other)

415 : it_(other.it_) {

416 }

417

418 UnicodeText::const_iterator&

419 UnicodeText::const_iterator::operator=(const const_iterator& other) {

420 if (&other != this)

421 it_ = other.it_;

422 return *this;

423 }

424

425 UnicodeText::const_iterator UnicodeText::begin() const {

426 return const_iterator(repr_.data_);

427 }

428

429 UnicodeText::const_iterator UnicodeText::end() const {

430 return const_iterator(repr_.data_ + repr_.size_);

431 }

432

433 bool operator<(const UnicodeText::const_iterator& lhs,

434 const UnicodeText::const_iterator& rhs) {

435 return lhs.it_ < rhs.it_;

436 }

437

438 char32 UnicodeText::const_iterator::operator*() const {

439 // (We could call chartorune here, but that does some

440 // error-checking, and we're guaranteed that our data is valid

441 // UTF-8. Also, we expect this routine to be called very often. So

442 // for speed, we do the calculation ourselves.)

443

444 // Convert from UTF-8

445 int byte1 = it_[0];

446 if (byte1 < 0x80)

447 return byte1;

448

449 int byte2 = it_[1];

450 if (byte1 < 0xE0)

451 return ((byte1 & 0x1F) << 6)

452 \| (byte2 & 0x3F);

453

454 int byte3 = it_[2];

455 if (byte1 < 0xF0)

456 return ((byte1 & 0x0F) << 12)

457 \| ((byte2 & 0x3F) << 6)

458 \| (byte3 & 0x3F);

459

460 int byte4 = it_[3];

461 return ((byte1 & 0x07) << 18)

462 \| ((byte2 & 0x3F) << 12)

463 \| ((byte3 & 0x3F) << 6)

464 \| (byte4 & 0x3F);

465 }

466

467 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {

468 it_ += UniLib::OneCharLen(it_);

469 return *this;

470 }

471

472 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {

473 while (UniLib::IsTrailByte(*--it_));

474 return *this;

475 }

476

477 int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {

478 utf8_output[0] = it_[0];

479 if (static_cast<unsigned char>(it_[0]) < 0x80)

480 return 1;

481

482 utf8_output[1] = it_[1];

483 if (static_cast<unsigned char>(it_[0]) < 0xE0)

484 return 2;

485

486 utf8_output[2] = it_[2];

487 if (static_cast<unsigned char>(it_[0]) < 0xF0)

488 return 3;

489

490 utf8_output[3] = it_[3];

491 return 4;

492 }

493

494

495 UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {

496 assert(p != NULL);

497 const char* start = utf8_data();

498 int len = utf8_length();

499 const char* end = start + len;

500 assert(p >= start);

501 assert(p <= end);

502 assert(p == end \|\| !UniLib::IsTrailByte(*p));

503 return const_iterator(p);

504 }

505

506 string UnicodeText::const_iterator::DebugString() const {

507 stringstream ss;

508

509 ss << "{iter " << hex << it_ << "}";

510 string result;

511 ss >> result;

512

513 return result;

514 }

515

OLD	NEW

« no previous file with comments | « third_party/libphonenumber/cpp/src/utf/unicodetext.h ('k') | third_party/libphonenumber/cpp/src/utf/unilib.h » ('j') | no next file with comments »