OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | |
2 // for details. All rights reserved. Use of this source code is governed by a | |
3 // BSD-style license that can be found in the LICENSE file. | |
4 | |
5 #include "unicode.h" | |
6 | |
7 const int8_t Utf8::kTrailBytes[256] = { | |
8 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
9 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
10 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
11 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
12 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
13 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
14 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
16 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
17 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
18 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
19 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
20 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
21 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
22 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, | |
23 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 | |
24 }; | |
25 | |
26 | |
27 const uint32_t Utf8::kMagicBits[7] = { | |
28 0, // Padding. | |
29 0x00000000, | |
30 0x00003080, | |
31 0x000E2080, | |
32 0x03C82080, | |
33 0xFA082080, | |
34 0x82082080 | |
35 }; | |
36 | |
37 | |
38 // Minimum values of code points used to check shortest form. | |
39 const uint32_t Utf8::kOverlongMinimum[7] = { | |
40 0, // Padding. | |
41 0x0, | |
42 0x80, | |
43 0x800, | |
44 0x10000, | |
45 0xFFFFFFFF, | |
46 0xFFFFFFFF | |
47 }; | |
48 | |
49 class CodePointIterator { | |
50 public: | |
51 explicit CodePointIterator(List<uint16_t> str) | |
52 : str_(str), | |
53 ch_(0), | |
54 index_(-1), | |
55 end_(str.length()) { | |
56 } | |
57 | |
58 int32_t Current() const { | |
59 return ch_; | |
60 } | |
61 | |
62 bool Next() { | |
63 intptr_t length = Utf16::Length(ch_); | |
64 if (index_ < (end_ - length)) { | |
65 index_ += length; | |
66 ch_ = str_[index_]; | |
67 if (Utf16::IsLeadSurrogate(ch_) && (index_ < (end_ - 1))) { | |
68 int32_t ch2 = str_[index_ + 1]; | |
69 if (Utf16::IsTrailSurrogate(ch2)) { | |
70 ch_ = Utf16::Decode(ch_, ch2); | |
71 } | |
72 } | |
73 return true; | |
74 } | |
75 index_ = end_; | |
76 return false; | |
77 } | |
78 | |
79 private: | |
80 List<uint16_t> str_; | |
81 int32_t ch_; | |
82 intptr_t index_; | |
83 intptr_t end_; | |
84 }; | |
85 | |
86 // Returns the most restricted coding form in which the sequence of utf8 | |
87 // characters in 'utf8_array' can be represented in, and the number of | |
88 // code units needed in that form. | |
89 intptr_t Utf8::CodeUnitCount(const char* utf8_array, | |
90 intptr_t array_len, | |
91 Type* type) { | |
92 intptr_t len = 0; | |
93 Type char_type = kLatin1; | |
94 for (intptr_t i = 0; i < array_len; i++) { | |
95 uint8_t code_unit = utf8_array[i]; | |
96 if (!IsTrailByte(code_unit)) { | |
97 ++len; | |
98 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF | |
99 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000 | |
100 char_type = kSupplementary; | |
101 ++len; | |
102 } else if (char_type == kLatin1) { | |
103 char_type = kBMP; | |
104 } | |
105 } | |
106 } | |
107 } | |
108 *type = char_type; | |
109 return len; | |
110 } | |
111 | |
112 intptr_t Utf8::Length(int32_t ch) { | |
113 if (ch <= kMaxOneByteChar) { | |
114 return 1; | |
115 } else if (ch <= kMaxTwoByteChar) { | |
116 return 2; | |
117 } else if (ch <= kMaxThreeByteChar) { | |
118 return 3; | |
119 } | |
120 return 4; | |
121 } | |
122 | |
123 intptr_t Utf8::Length(List<uint16_t> str) { | |
124 intptr_t length = 0; | |
125 CodePointIterator it(str); | |
126 while (it.Next()) { | |
127 int32_t ch = it.Current(); | |
128 length += Utf8::Length(ch); | |
129 } | |
130 return length; | |
131 } | |
132 | |
133 intptr_t Utf8::Encode(int32_t ch, char* dst) { | |
134 static const int kMask = ~(1 << 6); | |
135 if (ch <= kMaxOneByteChar) { | |
136 dst[0] = ch; | |
137 return 1; | |
138 } | |
139 if (ch <= kMaxTwoByteChar) { | |
140 dst[0] = 0xC0 | (ch >> 6); | |
141 dst[1] = 0x80 | (ch & kMask); | |
142 return 2; | |
143 } | |
144 if (ch <= kMaxThreeByteChar) { | |
145 dst[0] = 0xE0 | (ch >> 12); | |
146 dst[1] = 0x80 | ((ch >> 6) & kMask); | |
147 dst[2] = 0x80 | (ch & kMask); | |
148 return 3; | |
149 } | |
150 dst[0] = 0xF0 | (ch >> 18); | |
151 dst[1] = 0x80 | ((ch >> 12) & kMask); | |
152 dst[2] = 0x80 | ((ch >> 6) & kMask); | |
153 dst[3] = 0x80 | (ch & kMask); | |
154 return 4; | |
155 } | |
156 | |
157 intptr_t Utf8::Encode(List<uint16_t> src, char* dst, intptr_t len) { | |
158 intptr_t pos = 0; | |
159 CodePointIterator it(src); | |
160 while (it.Next()) { | |
161 int32_t ch = it.Current(); | |
162 intptr_t num_bytes = Utf8::Length(ch); | |
163 if (pos + num_bytes > len) { | |
164 break; | |
165 } | |
166 Utf8::Encode(ch, &dst[pos]); | |
167 pos += num_bytes; | |
168 } | |
169 return pos; | |
170 } | |
171 | |
172 intptr_t Utf8::Decode(const char* utf8_array, | |
173 intptr_t array_len, | |
174 int32_t* dst) { | |
175 uint32_t ch = utf8_array[0] & 0xFF; | |
176 intptr_t i = 1; | |
177 if (ch >= 0x80) { | |
178 intptr_t num_trail_bytes = kTrailBytes[ch]; | |
179 bool is_malformed = false; | |
180 for (; i < num_trail_bytes; ++i) { | |
181 if (i < array_len) { | |
182 uint8_t code_unit = utf8_array[i]; | |
183 is_malformed |= !IsTrailByte(code_unit); | |
184 ch = (ch << 6) + code_unit; | |
185 } else { | |
186 *dst = -1; | |
187 return 0; | |
188 } | |
189 } | |
190 ch -= kMagicBits[num_trail_bytes]; | |
191 if (!((is_malformed == false) && | |
192 (i == num_trail_bytes) && | |
193 !Utf::IsOutOfRange(ch) && | |
194 !IsNonShortestForm(ch, i) && | |
195 !Utf16::IsSurrogate(ch))) { | |
196 *dst = -1; | |
197 return 0; | |
198 } | |
199 } | |
200 *dst = ch; | |
201 return i; | |
202 } | |
203 | |
204 bool Utf8::DecodeToUTF16(const char* utf8_array, | |
205 intptr_t array_len, | |
206 uint16_t* dst, | |
207 intptr_t len) { | |
208 intptr_t i = 0; | |
209 intptr_t j = 0; | |
210 intptr_t num_bytes; | |
211 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | |
212 int32_t ch; | |
213 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]); | |
214 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | |
215 if (ch == -1) { | |
216 return false; // Invalid input. | |
217 } | |
218 if (is_supplementary) { | |
219 Utf16::Encode(ch, &dst[j]); | |
220 j = j + 1; | |
221 } else { | |
222 dst[j] = ch; | |
223 } | |
224 } | |
225 if ((i < array_len) && (j == len)) { | |
226 return false; // Output overflow. | |
227 } | |
228 return true; // Success. | |
229 } | |
230 | |
231 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { | |
232 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); | |
233 dst[1] = (0xDC00 + (codepoint & 0x3FF)); | |
234 } | |
OLD | NEW |