OLD | NEW |
| (Empty) |
1 /** | |
2 * Copyright 2010 Google Inc. | |
3 * | |
4 * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 * you may not use this file except in compliance with the License. | |
6 * You may obtain a copy of the License at | |
7 * | |
8 * http://www.apache.org/licenses/LICENSE-2.0 | |
9 * | |
10 * Unless required by applicable law or agreed to in writing, software | |
11 * distributed under the License is distributed on an "AS IS" BASIS, | |
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 * See the License for the specific language governing permissions and | |
14 * limitations under the License. | |
15 */ | |
16 | |
17 // Routines to do manipulation of Unicode characters or text | |
18 // | |
19 // The StructurallyValid routines accept buffers of arbitrary bytes. | |
20 // For CoerceToStructurallyValid(), the input buffer and output buffers may | |
21 // point to exactly the same memory. | |
22 // | |
23 // In all other cases, the UTF-8 string must be structurally valid and | |
24 // have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF. | |
25 // Debug builds take a fatal error for invalid UTF-8 input. | |
26 // The input and output buffers may not overlap at all. | |
27 // | |
28 // The char32 routines are here only for convenience; they convert to UTF-8 | |
29 // internally and use the UTF-8 routines. | |
30 | |
31 #ifndef UTIL_UTF8_UNILIB_H__ | |
32 #define UTIL_UTF8_UNILIB_H__ | |
33 | |
34 #include <string> | |
35 #include "base/basictypes.h" | |
36 | |
37 namespace UniLib { | |
38 | |
39 // Returns true unless a surrogate code point | |
40 inline bool IsValidCodepoint(char32 c) { | |
41 // In the range [0, 0xD800) or [0xE000, 0x10FFFF] | |
42 return (static_cast<uint32>(c) < 0xD800) | |
43 || (c >= 0xE000 && c <= 0x10FFFF); | |
44 } | |
45 | |
// UTF-8 character length in bytes, indexed by the value of the first byte.
// Bytes 0x80..0xBF (which have the trail-byte bit pattern) map to 1, and
// every byte from 0xF0 through 0xFF maps to 4.
static const unsigned char kUTF8LenTbl[256] = {
  // 0x00..0x1F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x20..0x3F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x40..0x5F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x60..0x7F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

  // 0x80..0x9F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xA0..0xBF
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xC0..0xDF
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  // 0xE0..0xEF, then 0xF0..0xFF
  3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
};
58 | |
59 // Return length of a single UTF-8 source character | |
60 inline int OneCharLen(const char* src) { | |
61 return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)]; | |
62 } | |
63 | |
64 // Return length of a single UTF-8 source character | |
65 inline int OneCharLen(const uint8* src) { | |
66 return kUTF8LenTbl[*src]; | |
67 } | |
68 | |
// Reports whether x is a UTF-8 trailing byte, i.e. has the bit pattern
// 10xx xxxx (0x80..0xBF).
inline bool IsTrailByte(char x) {
  // Equivalent to (x & 0xC0) == 0x80. Viewed as a signed 8-bit value the
  // range 0x80..0xBF is -128..-65, so one comparison is enough.
  const signed char as_signed = static_cast<signed char>(x);
  return as_signed <= -0x41;
}
75 | |
// Returns how many bytes at the start of src form interchange valid UTF-8.
// This overload is only declared here; its definition is not in this header.
int SpanInterchangeValid(const char* src, int byte_length);

// std::string convenience overload: forwards the string's bytes and size to
// the pointer/length version above. The size is converted to int.
inline int SpanInterchangeValid(const std::string& src) {
  const char* bytes = src.data();
  const int byte_length = static_cast<int>(src.size());
  return SpanInterchangeValid(bytes, byte_length);
}
82 | |
83 // Returns true if the source is all interchange valid UTF-8 | |
84 // "Interchange valid" is a stronger than structurally valid -- | |
85 // no C0 or C1 control codes (other than CR LF HT FF) and no non-characters. | |
86 inline bool IsInterchangeValid(const char* src, int byte_length) { | |
87 return (byte_length == SpanInterchangeValid(src, byte_length)); | |
88 } | |
89 inline bool IsInterchangeValid(const std::string& src) { | |
90 return IsInterchangeValid(src.data(), src.size()); | |
91 } | |
92 | |
93 } // namespace UniLib | |
94 | |
95 #endif // UTIL_UTF8_UNILIB_H__ | |
OLD | NEW |