OLD | NEW |
(Empty) | |
| 1 /** |
| 2 * Copyright 2010 Google Inc. |
| 3 * |
| 4 * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 * you may not use this file except in compliance with the License. |
| 6 * You may obtain a copy of the License at |
| 7 * |
| 8 * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 * |
| 10 * Unless required by applicable law or agreed to in writing, software |
| 11 * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 * See the License for the specific language governing permissions and |
| 14 * limitations under the License. |
| 15 */ |
| 16 |
| 17 // Routines to do manipulation of Unicode characters or text |
| 18 // |
| 19 // The StructurallyValid routines accept buffers of arbitrary bytes. |
| 20 // For CoerceToStructurallyValid(), the input buffer and output buffers may |
| 21 // point to exactly the same memory. |
| 22 // |
| 23 // In all other cases, the UTF-8 string must be structurally valid and |
| 24 // have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF. |
| 25 // Debug builds take a fatal error for invalid UTF-8 input. |
| 26 // The input and output buffers may not overlap at all. |
| 27 // |
| 28 // The char32 routines are here only for convenience; they convert to UTF-8 |
| 29 // internally and use the UTF-8 routines. |
| 30 |
| 31 #ifndef UTIL_UTF8_UNILIB_H__ |
| 32 #define UTIL_UTF8_UNILIB_H__ |
| 33 |
| 34 #include <string> |
| 35 #include "base/basictypes.h" |
| 36 |
| 37 namespace UniLib { |
| 38 |
| 39 // Returns true unless a surrogate code point |
| 40 inline bool IsValidCodepoint(char32 c) { |
| 41 // In the range [0, 0xD800) or [0xE000, 0x10FFFF] |
| 42 return (static_cast<uint32>(c) < 0xD800) |
| 43 || (c >= 0xE000 && c <= 0x10FFFF); |
| 44 } |
| 45 |
// Lookup table giving the byte length of a UTF-8 character, indexed by the
// value of its first byte. One row per 16 byte values.
//   0x00-0xBF -> 1 (this includes the trail-byte range 0x80-0xBF)
//   0xC0-0xDF -> 2
//   0xE0-0xEF -> 3
//   0xF0-0xFF -> 4
static const unsigned char kUTF8LenTbl[256] = {
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  // 0x00-0x0F
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  // 0x10-0x1F
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  // 0x20-0x2F
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  // 0x30-0x3F
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  // 0x40-0x4F
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  // 0x50-0x5F
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  // 0x60-0x6F
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  // 0x70-0x7F

  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  // 0x80-0x8F
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  // 0x90-0x9F
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  // 0xA0-0xAF
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,  // 0xB0-0xBF
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,  // 0xC0-0xCF
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,  // 0xD0-0xDF
  3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,  // 0xE0-0xEF
  4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4   // 0xF0-0xFF
};
| 58 |
| 59 // Return length of a single UTF-8 source character |
| 60 inline int OneCharLen(const char* src) { |
| 61 return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)]; |
| 62 } |
| 63 |
| 64 // Return length of a single UTF-8 source character |
| 65 inline int OneCharLen(const uint8* src) { |
| 66 return kUTF8LenTbl[*src]; |
| 67 } |
| 68 |
// Reports whether x is a UTF-8 trailing (continuation) byte, i.e. has the
// bit pattern 10xx xxxx.
inline bool IsTrailByte(char x) {
  // Equivalent to (x & 0xC0) == 0x80. Trail bytes are exactly
  // [0x80, 0xBF]; viewed as a signed char those are the values
  // [-0x80, -0x41], so a single signed comparison is enough.
  const signed char as_signed = static_cast<signed char>(x);
  return as_signed <= -0x41;
}
| 75 |
// Returns the length in bytes of the leading portion of src that is
// entirely interchange-valid UTF-8. Declared here; the definition is not
// in this header.
int SpanInterchangeValid(const char* src, int byte_length);

// std::string convenience overload: forwards the string's bytes and size
// to the pointer/length version above.
inline int SpanInterchangeValid(const std::string& src) {
  const char* const bytes = src.data();
  // The pointer/length overload takes an int, so the size is narrowed.
  // NOTE(review): sizes above INT_MAX would not survive this conversion.
  const int byte_length = static_cast<int>(src.size());
  return SpanInterchangeValid(bytes, byte_length);
}
| 82 |
| 83 // Returns true if the source is all interchange valid UTF-8 |
| 84 // "Interchange valid" is a stronger than structurally valid -- |
| 85 // no C0 or C1 control codes (other than CR LF HT FF) and no non-characters. |
| 86 inline bool IsInterchangeValid(const char* src, int byte_length) { |
| 87 return (byte_length == SpanInterchangeValid(src, byte_length)); |
| 88 } |
| 89 inline bool IsInterchangeValid(const std::string& src) { |
| 90 return IsInterchangeValid(src.data(), src.size()); |
| 91 } |
| 92 |
| 93 } // namespace UniLib |
| 94 |
| 95 #endif // UTIL_UTF8_UNILIB_H__ |
OLD | NEW |