OLD | NEW |
| (Empty) |
// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
4 | |
5 #ifndef UNICODE_H_ | |
6 #define UNICODE_H_ | |
7 | |
8 #include "struct.h" | |
9 | |
// Code point limits and range predicates. All members are static; the class
// is used purely as a namespace.
class Utf {
 public:
  // Largest value accepted as a code point by the predicates below.
  static const int32_t kMaxCodePoint = 0x10FFFF;

  // Returns true if code_point lies in [0, 0xFF].
  static bool IsLatin1(int32_t code_point) {
    return (code_point >= 0) && (code_point <= 0xFF);
  }

  // Returns true if code_point lies in [0, 0xFFFF]. Surrogate values are not
  // excluded by this check.
  static bool IsBmp(int32_t code_point) {
    return (code_point >= 0) && (code_point <= 0xFFFF);
  }

  // Returns true if code_point lies in [0x10000, kMaxCodePoint].
  static bool IsSupplementary(int32_t code_point) {
    return (code_point > 0xFFFF) && (code_point <= kMaxCodePoint);
  }

  // Returns true if code_point is outside [0, kMaxCodePoint], i.e. it is
  // either negative or greater than kMaxCodePoint. Note that the parameter
  // is intptr_t rather than int32_t, unlike the other predicates.
  static bool IsOutOfRange(intptr_t code_point) {
    return (code_point < 0) || (code_point > kMaxCodePoint);
  }
};
31 | |
32 class Utf8 { | |
33 public: | |
34 enum Type { | |
35 kLatin1 = 0, // Latin-1 code point [U+0000, U+00FF]. | |
36 kBMP, // Basic Multilingual Plane code point [U+0000, U+FFFF]. | |
37 kSupplementary, // Supplementary code point [U+010000, U+10FFFF]. | |
38 }; | |
39 | |
40 // Returns the most restricted coding form in which the sequence of utf8 | |
41 // characters in 'utf8_array' can be represented in, and the number of | |
42 // code units needed in that form. | |
43 static intptr_t CodeUnitCount(const char* utf8_array, | |
44 intptr_t array_len, | |
45 Type* type); | |
46 | |
47 static intptr_t Length(int32_t ch); | |
48 static intptr_t Length(List<uint16_t> str); | |
49 | |
50 static intptr_t Encode(int32_t ch, char* dst); | |
51 static intptr_t Encode(List<uint16_t> str, char* dst, intptr_t len); | |
52 | |
53 static intptr_t Decode(const char* utf8_array, | |
54 intptr_t array_len, | |
55 int32_t* ch); | |
56 static bool DecodeToUTF16(const char* utf8_array, | |
57 intptr_t array_len, | |
58 uint16_t* dst, | |
59 intptr_t len); | |
60 | |
61 static const int32_t kMaxOneByteChar = 0x7F; | |
62 static const int32_t kMaxTwoByteChar = 0x7FF; | |
63 static const int32_t kMaxThreeByteChar = 0xFFFF; | |
64 static const int32_t kMaxFourByteChar = Utf::kMaxCodePoint; | |
65 | |
66 private: | |
67 static bool IsTrailByte(uint8_t code_unit) { | |
68 return (code_unit & 0xC0) == 0x80; | |
69 } | |
70 | |
71 static bool IsNonShortestForm(uint32_t code_point, size_t num_code_units) { | |
72 return code_point < kOverlongMinimum[num_code_units]; | |
73 } | |
74 | |
75 static bool IsLatin1SequenceStart(uint8_t code_unit) { | |
76 // Check if utf8 sequence is the start of a codepoint <= U+00FF | |
77 return (code_unit <= 0xC3); | |
78 } | |
79 | |
80 static bool IsSupplementarySequenceStart(uint8_t code_unit) { | |
81 // Check if utf8 sequence is the start of a codepoint >= U+10000. | |
82 return (code_unit >= 0xF0); | |
83 } | |
84 | |
85 static const int8_t kTrailBytes[]; | |
86 static const uint32_t kMagicBits[]; | |
87 static const uint32_t kOverlongMinimum[]; | |
88 }; | |
89 | |
// UTF-16 helpers. All members are static.
class Utf16 {
 public:
  // Returns the length of the code point in UTF-16 code units: 1 if ch is at
  // most kMaxCodeUnit, otherwise 2.
  static intptr_t Length(int32_t ch) {
    return (ch <= Utf16::kMaxCodeUnit) ? 1 : 2;
  }

  // Returns true if ch is a lead or trail surrogate, i.e. in [0xD800, 0xDFFF].
  // The mask discards the low 11 bits.
  static bool IsSurrogate(int32_t ch) {
    return (ch & 0xFFFFF800) == 0xD800;
  }

  // Returns true if ch is a lead surrogate, i.e. in [0xD800, 0xDBFF].
  // The mask discards the low 10 bits.
  static bool IsLeadSurrogate(int32_t ch) {
    return (ch & 0xFFFFFC00) == 0xD800;
  }

  // Returns true if ch is a trail surrogate, i.e. in [0xDC00, 0xDFFF].
  static bool IsTrailSurrogate(int32_t ch) {
    return (ch & 0xFFFFFC00) == 0xDC00;
  }

  // Returns the character at i and advances i to the next character
  // boundary.
  //
  // If characters[*i] is a lead surrogate, is not the last code unit, and is
  // followed by a trail surrogate, the pair is combined with Decode() and *i
  // advances by two. Otherwise the single code unit is returned unchanged
  // (this includes unpaired surrogates) and *i advances by one.
  //
  // *i is not checked against len before the first read, so the caller must
  // ensure 0 <= *i < len.
  static int32_t Next(const uint16_t* characters, intptr_t* i, intptr_t len) {
    int32_t ch = characters[*i];
    if (Utf16::IsLeadSurrogate(ch) && (*i < (len - 1))) {
      int32_t ch2 = characters[*i + 1];
      if (Utf16::IsTrailSurrogate(ch2)) {
        ch = Utf16::Decode(ch, ch2);
        *i += 1;
      }
    }
    *i += 1;
    return ch;
  }

  // Decodes a surrogate pair into a supplementary code point: the low 10 bits
  // of lead become the high half and the low 10 bits of trail the low half of
  // a 20-bit offset from 0x10000. The arguments are not validated.
  static int32_t Decode(int32_t lead, int32_t trail) {
    return 0x10000 + ((lead & 0x3FF) << 10) + (trail & 0x3FF);
  }

  // Encodes a single code point. Defined elsewhere.
  static void Encode(int32_t codepoint, uint16_t* dst);

  // Largest value that fits in a single UTF-16 code unit.
  static const int32_t kMaxCodeUnit = 0xFFFF;

 private:
  // Not referenced in this header; presumably used by the out-of-line
  // Encode() -- confirm against the definition.
  static const int32_t kLeadSurrogateOffset = (0xD800 - (0x10000 >> 10));

  static const int32_t kSurrogateOffset = (0x10000 - (0xD800 << 10) - 0xDC00);
};
142 | |
143 #endif // UNICODE_H_ | |
OLD | NEW |