OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | |
2 // for details. All rights reserved. Use of this source code is governed by a | |
3 // BSD-style license that can be found in the LICENSE file. | |
4 | |
5 part of utf; | |
6 | |
// Largest code point that fits in a one-, two- and three-byte UTF-8 sequence
// (7, 11 and 16 payload bits respectively).
const int _UTF8_ONE_BYTE_MAX = (1 << 7) - 1;
const int _UTF8_TWO_BYTE_MAX = (1 << 11) - 1;
const int _UTF8_THREE_BYTE_MAX = (1 << 16) - 1;

// Selects the six payload bits carried by every non-leading byte.
const int _UTF8_LO_SIX_BIT_MASK = (1 << 6) - 1;

// Smallest leading byte of a two-, three-, four-, five- and six-byte sequence.
const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xC0;
const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xE0;
const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xF0;
const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xF8;
const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xFC;

// Masks applied to the leading byte's payload bits when encoding a two-,
// three- and four-byte sequence (5, 4 and 3 bits respectively).
const int _UTF8_FIRST_BYTE_OF_TWO_MASK = (1 << 5) - 1;
const int _UTF8_FIRST_BYTE_OF_THREE_MASK = (1 << 4) - 1;
const int _UTF8_FIRST_BYTE_OF_FOUR_MASK = (1 << 3) - 1;

// Bytes at or above this value can never start a sequence.
const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xFE;
// Bit pattern OR-ed into every non-leading byte of an encoded sequence.
const int _UTF8_SUBSEQUENT_BYTE_BASE = 0x80;
25 | |
/**
 * Wraps the UTF-8 [bytes] in an [IterableUtf8Decoder]. Decoding is lazy:
 * only as much of the input is converted as the consumer iterates over.
 * [offset] and [length] are handed to the decoder to select the part of
 * [bytes] to read. Pass null as [replacementCodepoint] to get an
 * [ArgumentError] on malformed input instead of a replacement code point.
 */
IterableUtf8Decoder decodeUtf8AsIterable(List<int> bytes, [int offset = 0,
    int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) =>
    new IterableUtf8Decoder(bytes, offset, length, replacementCodepoint);
36 | |
/**
 * Decodes a list of UTF-8 encoded [bytes] into a [String]. [offset] sets
 * where in the list decoding starts and [length] limits how many values are
 * decoded. Malformed input is replaced by [replacementCodepoint]; pass null
 * for it to get an [ArgumentError] instead.
 */
String decodeUtf8(List<int> bytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf8Decoder decoder =
      new Utf8Decoder(bytes, offset, length, replacementCodepoint);
  List<int> codepoints = decoder.decodeRest();
  return new String.fromCharCodes(codepoints);
}
50 | |
/**
 * Encodes [str] as a list of UTF-8 bytes by first converting it to code
 * points and then encoding those.
 */
List<int> encodeUtf8(String str) {
  List<int> codepoints = stringToCodepoints(str);
  return codepointsToUtf8(codepoints);
}
56 | |
/**
 * Writes the [bytes] trailing bytes of one encoded sequence into [buffer].
 * The low six bits of [value] go to index [offset] + [bytes], the next six
 * bits to the index below it, and so on down to [offset] + 1. The slot at
 * [offset] itself is left for the caller's leading byte. Returns what is left
 * of [value] after those bits have been shifted out.
 */
int _addToEncoding(int offset, int bytes, int value, List<int> buffer) {
  int remainingBits = value;
  for (int slot = offset + bytes; slot > offset; slot--) {
    buffer[slot] =
        _UTF8_SUBSEQUENT_BYTE_BASE | (remainingBits & _UTF8_LO_SIX_BIT_MASK);
    remainingBits >>= 6;
  }
  return remainingBits;
}
66 | |
/**
 * Encodes [codepoints] as UTF-8 bytes. [offset] and [length] are passed to
 * the list range that selects which code points are read. A value that is
 * negative or above UNICODE_VALID_RANGE_MAX is emitted as the three bytes
 * 0xef 0xbf 0xbd. The result is a fixed-length list sized in a first pass
 * over the input.
 */
List<int> codepointsToUtf8(
    List<int> codepoints, [int offset = 0, int length]) {
  _ListRange source = new _ListRange(codepoints, offset, length);

  // Number of bytes one code point occupies in the output. Out-of-range
  // values count as three because that is the size of their replacement.
  int encodedWidth(int codepoint) {
    if (codepoint < 0 || codepoint > UNICODE_VALID_RANGE_MAX) return 3;
    if (codepoint <= _UTF8_ONE_BYTE_MAX) return 1;
    if (codepoint <= _UTF8_TWO_BYTE_MAX) return 2;
    if (codepoint <= _UTF8_THREE_BYTE_MAX) return 3;
    return 4;
  }

  // First pass: size the output exactly.
  int totalBytes = 0;
  for (int codepoint in source) {
    totalBytes += encodedWidth(codepoint);
  }

  // Second pass: write each sequence at the running cursor.
  List<int> encoded = new List<int>(totalBytes);
  int cursor = 0;
  for (int codepoint in source) {
    int width = encodedWidth(codepoint);
    if (codepoint < 0 || codepoint > UNICODE_VALID_RANGE_MAX) {
      encoded.setRange(cursor, cursor + 3, [0xef, 0xbf, 0xbd]);
    } else if (width == 1) {
      encoded[cursor] = codepoint;
    } else {
      int leadBase;
      int leadMask;
      switch (width) {
        case 2:
          leadBase = _UTF8_FIRST_BYTE_OF_TWO_BASE;
          leadMask = _UTF8_FIRST_BYTE_OF_TWO_MASK;
          break;
        case 3:
          leadBase = _UTF8_FIRST_BYTE_OF_THREE_BASE;
          leadMask = _UTF8_FIRST_BYTE_OF_THREE_MASK;
          break;
        default:
          leadBase = _UTF8_FIRST_BYTE_OF_FOUR_BASE;
          leadMask = _UTF8_FIRST_BYTE_OF_FOUR_MASK;
      }
      // The trailing bytes are written first; what is left of the code point
      // forms the payload of the leading byte.
      int leadBits = _addToEncoding(cursor, width - 1, codepoint, encoded);
      encoded[cursor] = leadBase | (leadMask & leadBits);
    }
    cursor += width;
  }
  return encoded;
}
117 | |
// UTF-8 specifies its own byte order, so this does not need to follow the
// byte-order pattern used by the UTF-16 and UTF-32 decoders.
List<int> utf8ToCodepoints(
    List<int> utf8EncodedBytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf8Decoder decoder = new Utf8Decoder(
      utf8EncodedBytes, offset, length, replacementCodepoint);
  return decoder.decodeRest();
}
126 | |
/**
 * Return type of [decodeUtf8AsIterable] and variants. Each request for an
 * iterator creates a fresh [Utf8Decoder] over the same bytes, and that
 * decoder translates bytes only as its user asks for them. Results are not
 * cached, so iterating twice decodes twice.
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class IterableUtf8Decoder extends IterableBase<int> {
  /** The UTF-8 encoded input. */
  final List<int> bytes;

  /** Passed to each [Utf8Decoder] as the offset into [bytes]. */
  final int offset;

  // NOTE(review): this field is handed to the decoder as the input length and
  // may be null; it also takes the place of Iterable.length - confirm that
  // callers do not expect an element count here.
  final int length;

  /** Code point substituted for malformed input; null means throw instead. */
  final int replacementCodepoint;

  IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);

  Utf8Decoder get iterator {
    return new Utf8Decoder(bytes, offset, length, replacementCodepoint);
  }
}
146 | |
/**
 * An iterator of Unicode code points decoded from UTF-8 encoded bytes. The
 * constructor parameters can set an offset into the list of bytes, limit the
 * number of values to be decoded, and override the default Unicode
 * replacement code point. Set [replacementCodepoint] to null to throw an
 * [ArgumentError] rather than replace a malformed sequence.
 *
 * A sequence is reported as malformed when it starts with a byte that cannot
 * start a sequence, is cut short, is an overlong encoding, decodes to a value
 * in the UTF-16 reserved range, or decodes to a value above
 * UNICODE_VALID_RANGE_MAX.
 */
class Utf8Decoder implements Iterator<int> {
  final _ListRangeIterator utf8EncodedBytesIterator;
  final int replacementCodepoint;
  int _current = null;

  Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length,
      this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf8EncodedBytesIterator =
          (new _ListRange(utf8EncodedBytes, offset, length)).iterator;


  Utf8Decoder._fromListRangeIterator(_ListRange source, [
      this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf8EncodedBytesIterator = source.iterator;

  /**
   * Decodes the remainder of the input held by this decoder into a
   * [List<int>] of code points.
   */
  List<int> decodeRest() {
    // Every code point consumes at least one input value, so a buffer sized
    // by the iterator's remaining count is always large enough.
    List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining);
    int i = 0;
    while (moveNext()) {
      codepoints[i++] = current;
    }
    if (i == codepoints.length) {
      return codepoints;
    } else {
      // Multi-byte sequences produced fewer code points than input values;
      // return a list of exactly the decoded length.
      List<int> truncCodepoints = new List<int>(i);
      truncCodepoints.setRange(0, i, codepoints);
      return truncCodepoints;
    }
  }

  /** The code point produced by the last successful [moveNext], else null. */
  int get current => _current;

  /**
   * Decodes the next code point into [current]. Returns false once the input
   * is exhausted. A malformed sequence yields [replacementCodepoint], or
   * throws an [ArgumentError] when [replacementCodepoint] is null.
   */
  bool moveNext() {
    _current = null;

    if (!utf8EncodedBytesIterator.moveNext()) return false;

    int value = utf8EncodedBytesIterator.current;
    int additionalBytes = 0;

    // Classify the leading byte: either it is complete on its own, it is
    // invalid as a leading byte, or it announces 1..5 continuation bytes.
    if (value < 0) {
      return _replaceOrThrow(utf8EncodedBytesIterator.position);
    } else if (value <= _UTF8_ONE_BYTE_MAX) {
      _current = value;
      return true;
    } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
      // A continuation byte with no leading byte in front of it.
      return _replaceOrThrow(utf8EncodedBytesIterator.position);
    } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;
      additionalBytes = 1;
    } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;
      additionalBytes = 2;
    } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;
      additionalBytes = 3;
    } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;
      additionalBytes = 4;
    } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {
      value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;
      additionalBytes = 5;
    } else {
      return _replaceOrThrow(utf8EncodedBytesIterator.position);
    }

    // Fold in the continuation bytes, six payload bits at a time.
    int j = 0;
    while (j < additionalBytes && utf8EncodedBytesIterator.moveNext()) {
      int nextValue = utf8EncodedBytesIterator.current;
      if (nextValue > _UTF8_ONE_BYTE_MAX &&
          nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
        value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK));
      } else {
        // The sequence was cut short by a value that is not a continuation
        // byte. Hand that value back so the next call decodes it on its own;
        // otherwise an ASCII byte that follows a truncated sequence would be
        // swallowed together with it.
        utf8EncodedBytesIterator.backup();
        break;
      }
      j++;
    }

    // All announced bytes were present and the result is not in the UTF-16
    // reserved range.
    bool validSequence = (j == additionalBytes && (
        value < UNICODE_UTF16_RESERVED_LO ||
        value > UNICODE_UTF16_RESERVED_HI));
    // The value really needed that many bytes. Five- and six-byte forms never
    // satisfy this, so they are always reported as malformed.
    bool nonOverlong =
        (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||
        (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||
        (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);
    bool inRange = value <= UNICODE_VALID_RANGE_MAX;
    if (validSequence && nonOverlong && inRange) {
      _current = value;
      return true;
    }
    return _replaceOrThrow(utf8EncodedBytesIterator.position - j);
  }

  // Reports a malformed sequence: makes [replacementCodepoint] the current
  // value, or throws an ArgumentError naming [position] when no replacement
  // is configured.
  bool _replaceOrThrow(int position) {
    if (replacementCodepoint == null) {
      throw new ArgumentError("Invalid UTF8 at ${position}");
    }
    _current = replacementCodepoint;
    return true;
  }
}
OLD | NEW |