| Index: third_party/libphonenumber/cpp/src/utf/unicodetext.h
 | 
| ===================================================================
 | 
| --- third_party/libphonenumber/cpp/src/utf/unicodetext.h	(revision 0)
 | 
| +++ third_party/libphonenumber/cpp/src/utf/unicodetext.h	(revision 0)
 | 
| @@ -0,0 +1,456 @@
 | 
| +// Copyright (C) 2006 Google Inc.
 | 
| +//
 | 
| +// Licensed under the Apache License, Version 2.0 (the "License");
 | 
| +// you may not use this file except in compliance with the License.
 | 
| +// You may obtain a copy of the License at
 | 
| +//
 | 
| +// http://www.apache.org/licenses/LICENSE-2.0
 | 
| +//
 | 
| +// Unless required by applicable law or agreed to in writing, software
 | 
| +// distributed under the License is distributed on an "AS IS" BASIS,
 | 
| +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | 
| +// See the License for the specific language governing permissions and
 | 
| +// limitations under the License.
 | 
| +
 | 
| +// Author: Jim Meehan
 | 
| +
 | 
| +#ifndef UTIL_UTF8_UNICODETEXT_H__
 | 
| +#define UTIL_UTF8_UNICODETEXT_H__
 | 
| +
 | 
| +#include <iterator>
 | 
| +#include <string>
 | 
| +#include <utility>
 | 
| +#include "base/basictypes.h"
 | 
| +//#include "util/utf8/public/config.h"
 | 
| +
 | 
| +using std::string;
 | 
| +using std::bidirectional_iterator_tag;
 | 
| +using std::pair;
 | 
| +
 | 
| +// ***************************** UnicodeText **************************
 | 
| +//
 | 
| +// A UnicodeText object is a container for a sequence of Unicode
 | 
| +// codepoint values. It has a default constructor, a copy constructor,
 | 
| +// and an assignment operator. Data can be appended to it from another
 | 
| +// UnicodeText, from iterators, or from a single codepoint.
 | 
| +//
 | 
| +// The internal representation of the text is UTF-8. Since UTF-8 is a
 | 
| +// variable-width format, UnicodeText does not provide random access
 | 
| +// to the text, and changes to the text are permitted only at the end.
 | 
| +//
 | 
| +// The UnicodeText class defines a const_iterator. The dereferencing
 | 
| +// operator (*) returns a codepoint (char32). The iterator is a
 | 
| +// bidirectional, read-only iterator. It becomes invalid if the text
 | 
| +// is changed.
 | 
| +//
 | 
| +// There are methods for appending and retrieving UTF-8 data directly.
 | 
| +// The 'utf8_data' method returns a const char* that contains the
 | 
| +// UTF-8-encoded version of the text; 'utf8_length' returns the number
 | 
| +// of bytes in the UTF-8 data. An iterator's 'get_utf8' method stores up to
 | 
| +// 4 bytes of UTF-8 data in a char array and returns the number of
 | 
| +// bytes that it stored.
 | 
| +//
 | 
| +// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
 | 
| +// 0x10FFFF], but UnicodeText has the additional restriction that it
 | 
| +// can contain only those characters that are valid for interchange on
 | 
| +// the Web. This excludes all of the control codes except for carriage
 | 
| +// return, line feed, and horizontal tab.  It also excludes
 | 
| +// non-characters, but codepoints that are in the Private Use regions
 | 
| +// are allowed, as are codepoints that are unassigned. (See the
 | 
| +// Unicode reference for details.) The function UniLib::IsInterchangeValid
 | 
| +// can be used as a test for this property.
 | 
| +//
 | 
| +// UnicodeTexts are safe. Every method that constructs or modifies a
 | 
| +// UnicodeText tests for interchange-validity, and will substitute a
 | 
| +// space for the invalid data. Such cases are reported via
 | 
| +// LOG(WARNING).
 | 
| +//
 | 
| +// MEMORY MANAGEMENT: copy, take ownership, or point to
 | 
| +//
 | 
| +// A UnicodeText is either an "owner", meaning that it owns the memory
 | 
| +// for the data buffer and will free it when the UnicodeText is
 | 
| +// destroyed, or it is an "alias", meaning that it does not.
 | 
| +//
 | 
| +// There are three methods for storing UTF-8 data in a UnicodeText:
 | 
| +//
 | 
| +// CopyUTF8(buffer, len) copies buffer.
 | 
| +//
 | 
| +// TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer.
 | 
| +//
 | 
| +// PointToUTF8(buffer, size) creates an alias pointing to buffer.
 | 
| +//
 | 
| +// All three methods perform a validity check on the buffer. There are
 | 
| +// private, "unsafe" versions of these functions that bypass the
 | 
| +// validity check. They are used internally and by friend-functions
 | 
| +// that are handling UTF-8 data that has already been validated.
 | 
| +//
 | 
| +// The purpose of an alias is to avoid making an unnecessary copy of a
 | 
| +// UTF-8 buffer while still providing access to the Unicode values
 | 
| +// within that text through iterators or the fast scanners that are
 | 
| +// based on UTF-8 state tables. The lifetime of an alias must not
 | 
| +// exceed the lifetime of the buffer from which it was constructed.
 | 
| +//
 | 
| +// The semantics of an alias might be described as "copy on write or
 | 
| +// repair." The source data is never modified. If push_back() or
 | 
| +// append() is called on an alias, a copy of the data will be created,
 | 
| +// and the UnicodeText will become an owner. If clear() is called on
 | 
| +// an alias, it becomes an (empty) owner.
 | 
| +//
 | 
| +// The copy constructor and the assignment operator produce an owner.
 | 
| +// That is, after direct initialization ("UnicodeText x(y);") or copy
 | 
| +// initialization ("UnicodeText x = y;") x will be an owner, even if y
 | 
| +// was an alias. The assignment operator ("x = y;") also produces an
 | 
| +// owner unless x and y are the same object and y is an alias.
 | 
| +//
 | 
| +// Aliases should be used with care. If the source from which an alias
 | 
| +// was created is freed, or if the contents are changed, while the
 | 
| +// alias is still in use, fatal errors could result. But it can be
 | 
| +// quite useful to have a UnicodeText "window" through which to see a
 | 
| +// UTF-8 buffer without having to pay the price of making a copy.
 | 
| +//
 | 
| +// UTILITIES
 | 
| +//
 | 
| +// The interfaces in util/utf8/public/textutils.h provide higher-level
 | 
| +// utilities for dealing with UnicodeTexts, including routines for
 | 
| +// creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or
 | 
| +// strings, creating strings from UnicodeTexts, normalizing text for
 | 
| +// efficient matching or display, and others.
 | 
| +
 | 
| +class UnicodeText {
 | 
| + public:
 | 
| +  class const_iterator;
 | 
| +
 | 
| +  typedef char32 value_type;
 | 
| +
 | 
| +  // Constructors. These always produce owners.
 | 
| +  UnicodeText();  // Create an empty text.
 | 
| +  UnicodeText(const UnicodeText& src);  // copy constructor
 | 
| +  // Construct a substring (copies the data).
 | 
| +  UnicodeText(const const_iterator& first, const const_iterator& last);
 | 
| +
 | 
| +  // Assignment operator. This copies the data and produces an owner
 | 
| +  // unless this == &src, e.g., "x = x;", which is a no-op.
 | 
| +  UnicodeText& operator=(const UnicodeText& src);
 | 
| +
 | 
| +  // x.Copy(y) copies the data from y into x.
 | 
| +  UnicodeText& Copy(const UnicodeText& src);
 | 
| +  inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); }
 | 
| +
 | 
| +  // x.PointTo(y) changes x so that it points to y's data.
 | 
| +  // It does not copy y or take ownership of y's data.
 | 
| +  UnicodeText& PointTo(const UnicodeText& src);
 | 
| +  UnicodeText& PointTo(const const_iterator& first,
 | 
| +                       const const_iterator& last);
 | 
| +
 | 
| +  ~UnicodeText();
 | 
| +
 | 
| +  void clear();  // Clear text.
 | 
| +  bool empty() const { return repr_.size_ == 0; }  // Test if text is empty.
 | 
| +
 | 
| +  // Add a codepoint to the end of the text.
 | 
| +  // If the codepoint is not interchange-valid, add a space instead
 | 
| +  // and log a warning.
 | 
| +  void push_back(char32 codepoint);
 | 
| +
 | 
| +  // Append every element of [first, last), one push_back() at a time.
 | 
| +  // The iterator's value_type must convert implicitly to char32, so any
 | 
| +  // sequence of codepoints can serve as the source. For example:
 | 
| +  //     char32 chars[] = {0x1, 0x2, ...};
 | 
| +  //     vector<char32> more_chars = ...;
 | 
| +  //     utext.append(chars, chars+arraysize(chars));
 | 
| +  //     utext.append(more_chars.begin(), more_chars.end());
 | 
| +  template<typename ForwardIterator>
 | 
| +  UnicodeText& append(ForwardIterator first, const ForwardIterator last) {
 | 
| +    for (; first != last; ++first) { push_back(*first); }
 | 
| +    return *this;
 | 
| +  }
 | 
| +
 | 
| +  // A specialization of the generic append() method.
 | 
| +  UnicodeText& append(const const_iterator& first, const const_iterator& last);
 | 
| +
 | 
| +  // An optimization of append(source.begin(), source.end()).
 | 
| +  UnicodeText& append(const UnicodeText& source);
 | 
| +
 | 
| +  int size() const;  // the number of Unicode characters (codepoints)
 | 
| +
 | 
| +  friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
 | 
| +  friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs);
 | 
| +
 | 
| +  class const_iterator {
 | 
| +    typedef const_iterator CI;
 | 
| +   public:
 | 
| +    typedef bidirectional_iterator_tag iterator_category;
 | 
| +    typedef char32 value_type;
 | 
| +    typedef ptrdiff_t difference_type;
 | 
| +    typedef void pointer;  // (Not needed.)
 | 
| +    typedef const char32 reference;  // (Needed for const_reverse_iterator)
 | 
| +
 | 
| +    // Iterators are default-constructible.
 | 
| +    const_iterator();
 | 
| +
 | 
| +    // It's safe to make multiple passes over a UnicodeText.
 | 
| +    const_iterator(const const_iterator& other);
 | 
| +    const_iterator& operator=(const const_iterator& other);
 | 
| +
 | 
| +    char32 operator*() const;  // Dereference
 | 
| +
 | 
| +    const_iterator& operator++();  // Advance (++iter)
 | 
| +    const_iterator operator++(int) {  // Post-increment (iter++)
 | 
| +      const_iterator before = *this;  // Position to hand back to the caller.
 | 
| +      operator++();
 | 
| +      return before;
 | 
| +    }
 | 
| +
 | 
| +    const_iterator& operator--();  // Retreat (--iter)
 | 
| +    const_iterator operator--(int) {  // Post-decrement (iter--)
 | 
| +      const_iterator before = *this;  // Position to hand back to the caller.
 | 
| +      operator--();
 | 
| +      return before;
 | 
| +    }
 | 
| +
 | 
| +    // We love relational operators.
 | 
| +    friend bool operator==(const CI& lhs, const CI& rhs) {
 | 
| +      return lhs.it_ == rhs.it_; }
 | 
| +    friend bool operator!=(const CI& lhs, const CI& rhs) {
 | 
| +      return !(lhs == rhs); }
 | 
| +    friend bool operator<(const CI& lhs, const CI& rhs);
 | 
| +    friend bool operator>(const CI& lhs, const CI& rhs) {
 | 
| +      return rhs < lhs; }
 | 
| +    friend bool operator<=(const CI& lhs, const CI& rhs) {
 | 
| +      return !(rhs < lhs); }
 | 
| +    friend bool operator>=(const CI& lhs, const CI& rhs) {
 | 
| +      return !(lhs < rhs); }
 | 
| +
 | 
| +    friend difference_type distance(const CI& first, const CI& last);
 | 
| +
 | 
| +    // UTF-8-specific methods
 | 
| +    // Store the UTF-8 encoding of the current codepoint into buf,
 | 
| +    // which must be at least 4 bytes long. Return the number of
 | 
| +    // bytes written.
 | 
| +    int get_utf8(char* buf) const;
 | 
| +    // Return the iterator's pointer into the UTF-8 data.
 | 
| +    const char* utf8_data() const { return it_; }
 | 
| +
 | 
| +    string DebugString() const;
 | 
| +
 | 
| +   private:
 | 
| +    friend class UnicodeText;
 | 
| +    friend class UnicodeTextUtils;
 | 
| +    friend class UTF8StateTableProperty;
 | 
| +    explicit const_iterator(const char* it) : it_(it) {}
 | 
| +
 | 
| +    const char* it_;
 | 
| +  };
 | 
| +
 | 
| +  const_iterator begin() const;
 | 
| +  const_iterator end() const;
 | 
| +
 | 
| +  class const_reverse_iterator : public std::reverse_iterator<const_iterator> {
 | 
| +   public:
 | 
| +    const_reverse_iterator(const_iterator it)  // Wraps a forward iterator.
 | 
| +        : std::reverse_iterator<const_iterator>(it) {}
 | 
| +    const char* utf8_data() const {  // Bytes of the codepoint referred to.
 | 
| +      const_iterator target(base());  // base() sits one codepoint past it.
 | 
| +      return (--target).utf8_data();
 | 
| +    }
 | 
| +    int get_utf8(char* buf) const {  // Encode that codepoint into buf.
 | 
| +      const_iterator target(base());
 | 
| +      return (--target).get_utf8(buf);
 | 
| +    }
 | 
| +  };
 | 
| +  const_reverse_iterator rbegin() const {
 | 
| +    return const_reverse_iterator(end());
 | 
| +  }
 | 
| +  const_reverse_iterator rend() const {
 | 
| +    return const_reverse_iterator(begin());
 | 
| +  }
 | 
| +
 | 
| +  // Substring searching.  Returns the beginning of the first
 | 
| +  // occurrence of "look", or end() if not found.
 | 
| +  const_iterator find(const UnicodeText& look, const_iterator start_pos) const;
 | 
| +  // Equivalent to find(look, begin())
 | 
| +  const_iterator find(const UnicodeText& look) const;
 | 
| +
 | 
| +  // Returns whether this contains the character U+FFFD.  This can
 | 
| +  // occur, for example, if the input to Encodings::Decode() had byte
 | 
| +  // sequences that were invalid in the source encoding.
 | 
| +  bool HasReplacementChar() const;
 | 
| +
 | 
| +  // UTF-8-specific methods
 | 
| +  //
 | 
| +  // Return the data, length, and capacity of the UTF-8-encoded version of
 | 
| +  // the text. Length and capacity are measured in bytes.
 | 
| +  const char* utf8_data() const { return repr_.data_; }
 | 
| +  int utf8_length() const { return repr_.size_; }
 | 
| +  int utf8_capacity() const { return repr_.capacity_; }
 | 
| +
 | 
| +  // Return the UTF-8 data as a string.
 | 
| +  static string UTF8Substring(const const_iterator& first,
 | 
| +                              const const_iterator& last);
 | 
| +
 | 
| +  // There are three methods for initializing a UnicodeText from UTF-8
 | 
| +  // data. They vary in details of memory management. In all cases,
 | 
| +  // the data is tested for interchange-validity. If it is not
 | 
| +  // interchange-valid, a LOG(WARNING) is issued, and each
 | 
| +  // structurally invalid byte and each interchange-invalid codepoint
 | 
| +  // is replaced with a space.
 | 
| +
 | 
| +  // x.CopyUTF8(buf, len) copies buf into x.
 | 
| +  UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
 | 
| +
 | 
| +  // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of
 | 
| +  // buf. buf is not copied.
 | 
| +  UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer,
 | 
| +                                   int byte_length,
 | 
| +                                   int byte_capacity);
 | 
| +
 | 
| +  // x.PointToUTF8(buf,len) changes x so that it points to buf
 | 
| +  // ("becomes an alias"). It does not take ownership or copy buf.
 | 
| +  // If the buffer is not valid, this has the same effect as
 | 
| +  // CopyUTF8(utf8_buffer, byte_length).
 | 
| +  UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
 | 
| +
 | 
| +  // Occasionally it is necessary to use functions that operate on the
 | 
| +  // pointer returned by utf8_data(). MakeIterator(p) provides a way
 | 
| +  // to get back to the UnicodeText level. It uses CHECK to ensure
 | 
| +  // that p is a pointer within this object's UTF-8 data, and that it
 | 
| +  // points to the beginning of a character.
 | 
| +  const_iterator MakeIterator(const char* p) const;
 | 
| +
 | 
| +  string DebugString() const;
 | 
| +
 | 
| + private:
 | 
| +  friend class const_iterator;
 | 
| +  friend class UnicodeTextUtils;
 | 
| +
 | 
| +  class Repr {  // A byte-string.
 | 
| +   public:
 | 
| +    char* data_;
 | 
| +    int size_;
 | 
| +    int capacity_;
 | 
| +    bool ours_;  // Do we own data_?
 | 
| +
 | 
| +    Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {}
 | 
| +    ~Repr() { if (ours_) delete[] data_; }
 | 
| +
 | 
| +    void clear();
 | 
| +    void reserve(int capacity);
 | 
| +    void resize(int size);
 | 
| +
 | 
| +    void append(const char* bytes, int byte_length);
 | 
| +    void Copy(const char* data, int size);
 | 
| +    void TakeOwnershipOf(char* data, int size, int capacity);
 | 
| +    void PointTo(const char* data, int size);
 | 
| +
 | 
| +    string DebugString() const;
 | 
| +
 | 
| +   private:
 | 
| +    Repr& operator=(const Repr&);
 | 
| +    Repr(const Repr& other);
 | 
| +  };
 | 
| +
 | 
| +  Repr repr_;
 | 
| +
 | 
| +  // UTF-8-specific private methods.
 | 
| +  // These routines do not perform a validity check when compiled
 | 
| +  // in opt mode.
 | 
| +  // It is an error to call these methods with UTF-8 data that
 | 
| +  // is not interchange-valid.
 | 
| +  //
 | 
| +  UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length);
 | 
| +  UnicodeText& UnsafeTakeOwnershipOfUTF8(
 | 
| +      char* utf8_buffer, int byte_length, int byte_capacity);
 | 
| +  UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length);
 | 
| +  UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length);
 | 
| +  const_iterator UnsafeFind(const UnicodeText& look,
 | 
| +                            const_iterator start_pos) const;
 | 
| +};
 | 
| +
 | 
| +bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
 | 
| +
 | 
| +inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) {
 | 
| +  return !operator==(lhs, rhs);  // Inequality is the negation of equality.
 | 
| +}
 | 
| +
 | 
| +// UnicodeTextRange is a pair of iterators, useful for specifying text
 | 
| +// segments. If the iterators are ==, the segment is empty.
 | 
| +typedef pair<UnicodeText::const_iterator,
 | 
| +             UnicodeText::const_iterator> UnicodeTextRange;
 | 
| +
 | 
| +inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) {
 | 
| +  return r.second == r.first;  // Both ends coincide: nothing in between.
 | 
| +}
 | 
| +
 | 
| +
 | 
| +// *************************** Utilities *************************
 | 
| +
 | 
| +// A factory function for creating a UnicodeText from a buffer of
 | 
| +// UTF-8 data. The new UnicodeText is an "owner": responsibility for
 | 
| +// utf8_buffer passes to the library, so the caller must not free it.
 | 
| +// NOTE(review): owned data is released with delete[] -- presumably the
 | 
| +// buffer has to come from new[]; confirm against the implementation.
 | 
| +// Each byte that is structurally invalid will be replaced with a
 | 
| +// space. Each codepoint that is interchange-invalid will also be
 | 
| +// replaced with a space, even if the codepoint was represented with a
 | 
| +// multibyte sequence in the UTF-8 data.
 | 
| +inline UnicodeText MakeUnicodeTextAcceptingOwnership(
 | 
| +    char* utf8_buffer, int byte_length, int byte_capacity) {
 | 
| +  return UnicodeText().TakeOwnershipOfUTF8(
 | 
| +      utf8_buffer, byte_length, byte_capacity);
 | 
| +}
 | 
| +
 | 
| +// A factory function for creating a UnicodeText that points at a buffer
 | 
| +// of UTF-8 data without taking ownership of it (an "alias").
 | 
| +// NOTE(review): the result is copy-constructed from a temporary, and the
 | 
| +// copy constructor is documented to yield an owner -- confirm aliasing.
 | 
| +inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(
 | 
| +    const char* utf8_buffer, int byte_length) {
 | 
| +  return UnicodeText().PointToUTF8(utf8_buffer, byte_length);
 | 
| +}
 | 
| +
 | 
| +// Create a UnicodeText from a UTF-8 string or buffer.
 | 
| +//
 | 
| +// If do_copy is true, then a copy of the string is made. The copy is
 | 
| +// owned by the resulting UnicodeText object and will be freed when
 | 
| +// the object is destroyed. This UnicodeText object is referred to
 | 
| +// as an "owner."
 | 
| +//
 | 
| +// If do_copy is false, then no copy is made. The resulting
 | 
| +// UnicodeText object does NOT take ownership of the string; in this
 | 
| +// case, the lifetime of the UnicodeText object must not exceed the
 | 
| +// lifetime of the string. This UnicodeText object is referred to as
 | 
| +// an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership.
 | 
| +//
 | 
| +// If the input string does not contain valid UTF-8, then a copy is
 | 
| +// made (as if do_copy were true) and coerced to valid UTF-8 by
 | 
| +// replacing each invalid byte with a space.
 | 
| +//
 | 
| +inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len,
 | 
| +                                     bool do_copy) {
 | 
| +  UnicodeText t;  // Default-constructed: an empty owner.
 | 
| +  if (do_copy) {
 | 
| +    t.CopyUTF8(utf8_buf, len);  // t gets its own copy of the bytes.
 | 
| +  } else {
 | 
| +    t.PointToUTF8(utf8_buf, len);  // t aliases utf8_buf if it is valid.
 | 
| +  }
 | 
| +  return t;  // NOTE(review): presumably stays an alias only if elided.
 | 
| +}
 | 
| +
 | 
| +inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) {
 | 
| +  const int byte_count = static_cast<int>(utf_string.size());  // In bytes.
 | 
| +  return UTF8ToUnicodeText(utf_string.data(), byte_count, do_copy); }
 | 
| +
 | 
| +inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) {
 | 
| +  const bool make_owner = true;  // Copying is the default for this overload.
 | 
| +  return UTF8ToUnicodeText(utf8_buf, len, make_owner); }
 | 
| +inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) {
 | 
| +  const bool make_owner = true;  // Copying is the default for this overload.
 | 
| +  return UTF8ToUnicodeText(utf8_string, make_owner); }
 | 
| +
 | 
| +// Return a string containing the UTF-8 encoded version of all the
 | 
| +// Unicode characters in t.
 | 
| +inline string UnicodeTextToUTF8(const UnicodeText& t) {
 | 
| +  return string(t.utf8_data(), static_cast<string::size_type>(t.utf8_length()));
 | 
| +}
 | 
| +
 | 
| +#endif  // UTIL_UTF8_UNICODETEXT_H__
 | 
| 
 | 
| Property changes on: third_party\libphonenumber\cpp\src\utf\unicodetext.h
 | 
| ___________________________________________________________________
 | 
| Added: svn:eol-style
 | 
|    + LF
 | 
| 
 | 
| 
 |