OLD | NEW |
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 part of utf; | 5 library utf.utf8; |
| 6 |
| 7 import "dart:collection"; |
| 8 |
| 9 import 'constants.dart'; |
| 10 import 'list_range.dart'; |
| 11 import 'shared.dart'; |
6 | 12 |
7 const int _UTF8_ONE_BYTE_MAX = 0x7f; | 13 const int _UTF8_ONE_BYTE_MAX = 0x7f; |
8 const int _UTF8_TWO_BYTE_MAX = 0x7ff; | 14 const int _UTF8_TWO_BYTE_MAX = 0x7ff; |
9 const int _UTF8_THREE_BYTE_MAX = 0xffff; | 15 const int _UTF8_THREE_BYTE_MAX = 0xffff; |
10 | 16 |
11 const int _UTF8_LO_SIX_BIT_MASK = 0x3f; | 17 const int _UTF8_LO_SIX_BIT_MASK = 0x3f; |
12 | 18 |
13 const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xc0; | 19 const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xc0; |
14 const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xe0; | 20 const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xe0; |
15 const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xf0; | 21 const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xf0; |
16 const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xf8; | 22 const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xf8; |
17 const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xfc; | 23 const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xfc; |
18 | 24 |
19 const int _UTF8_FIRST_BYTE_OF_TWO_MASK = 0x1f; | 25 const int _UTF8_FIRST_BYTE_OF_TWO_MASK = 0x1f; |
20 const int _UTF8_FIRST_BYTE_OF_THREE_MASK = 0xf; | 26 const int _UTF8_FIRST_BYTE_OF_THREE_MASK = 0xf; |
21 const int _UTF8_FIRST_BYTE_OF_FOUR_MASK = 0x7; | 27 const int _UTF8_FIRST_BYTE_OF_FOUR_MASK = 0x7; |
22 | 28 |
23 const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xfe; | 29 const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xfe; |
24 const int _UTF8_SUBSEQUENT_BYTE_BASE = 0x80; | 30 const int _UTF8_SUBSEQUENT_BYTE_BASE = 0x80; |
25 | 31 |
26 /** | 32 /** |
27 * Decodes the UTF-8 bytes as an iterable. Thus, the consumer can only convert | 33 * Decodes the UTF-8 bytes as an iterable. Thus, the consumer can only convert |
28 * as much of the input as needed. Set replacementCodepoint to null to | 34 * as much of the input as needed. Set replacementCodepoint to null to |
29 * throw an ArgumentError rather than replace the bad value. | 35 * throw an ArgumentError rather than replace the bad value. |
30 */ | 36 */ |
31 IterableUtf8Decoder decodeUtf8AsIterable(List<int> bytes, [int offset = 0, | 37 IterableUtf8Decoder decodeUtf8AsIterable(List<int> bytes, |
| 38 [int offset = 0, |
32 int length, | 39 int length, |
33 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 40 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
34 return new IterableUtf8Decoder(bytes, offset, length, replacementCodepoint); | 41 return new IterableUtf8Decoder(bytes, offset, length, replacementCodepoint); |
35 } | 42 } |
36 | 43 |
37 /** | 44 /** |
38 * Produce a String from a List of UTF-8 encoded bytes. The parameters | 45 * Produce a String from a List of UTF-8 encoded bytes. The parameters |
39 * can set an offset into a list of bytes (as int), limit the length of the | 46 * can set an offset into a list of bytes (as int), limit the length of the |
40 * values to be decoded, and override the default Unicode replacement character. | 47 * values to be decoded, and override the default Unicode replacement character. |
41 * Set replacementCodepoint to null to throw an ArgumentError | 48 * Set replacementCodepoint to null to throw an ArgumentError |
42 * rather than replace the bad value. | 49 * rather than replace the bad value. |
43 */ | 50 */ |
44 String decodeUtf8(List<int> bytes, [int offset = 0, int length, | 51 String decodeUtf8(List<int> bytes, |
| 52 [int offset = 0, |
| 53 int length, |
45 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 54 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
46 return new String.fromCharCodes( | 55 return new String.fromCharCodes( |
47 (new Utf8Decoder(bytes, offset, length, replacementCodepoint)) | 56 (new Utf8Decoder(bytes, offset, length, replacementCodepoint)) |
48 .decodeRest()); | 57 .decodeRest()); |
49 } | 58 } |
50 | 59 |
51 /** | 60 /** |
52 * Produce a sequence of UTF-8 encoded bytes from the provided string. | 61 * Produce a sequence of UTF-8 encoded bytes from the provided string. |
53 */ | 62 */ |
54 List<int> encodeUtf8(String str) => | 63 List<int> encodeUtf8(String str) => codepointsToUtf8(stringToCodepoints(str)); |
55 codepointsToUtf8(stringToCodepoints(str)); | |
56 | 64 |
57 int _addToEncoding(int offset, int bytes, int value, List<int> buffer) { | 65 int _addToEncoding(int offset, int bytes, int value, List<int> buffer) { |
58 while (bytes > 0) { | 66 while (bytes > 0) { |
59 buffer[offset + bytes] = _UTF8_SUBSEQUENT_BYTE_BASE | | 67 buffer[offset + bytes] = |
60 (value & _UTF8_LO_SIX_BIT_MASK); | 68 _UTF8_SUBSEQUENT_BYTE_BASE | (value & _UTF8_LO_SIX_BIT_MASK); |
61 value = value >> 6; | 69 value = value >> 6; |
62 bytes--; | 70 bytes--; |
63 } | 71 } |
64 return value; | 72 return value; |
65 } | 73 } |
66 | 74 |
67 /** | 75 /** |
68 * Encode code points as UTF-8 code units. | 76 * Encode code points as UTF-8 code units. |
69 */ | 77 */ |
70 List<int> codepointsToUtf8( | 78 List<int> codepointsToUtf8(List<int> codepoints, [int offset = 0, int length]) { |
71 List<int> codepoints, [int offset = 0, int length]) { | |
72 ListRange source = new ListRange(codepoints, offset, length); | 79 ListRange source = new ListRange(codepoints, offset, length); |
73 | 80 |
74 int encodedLength = 0; | 81 int encodedLength = 0; |
75 for (int value in source) { | 82 for (int value in source) { |
76 if (value < 0 || value > UNICODE_VALID_RANGE_MAX) { | 83 if (value < 0 || value > UNICODE_VALID_RANGE_MAX) { |
77 encodedLength += 3; | 84 encodedLength += 3; |
78 } else if (value <= _UTF8_ONE_BYTE_MAX) { | 85 } else if (value <= _UTF8_ONE_BYTE_MAX) { |
79 encodedLength++; | 86 encodedLength++; |
80 } else if (value <= _UTF8_TWO_BYTE_MAX) { | 87 } else if (value <= _UTF8_TWO_BYTE_MAX) { |
81 encodedLength += 2; | 88 encodedLength += 2; |
82 } else if (value <= _UTF8_THREE_BYTE_MAX) { | 89 } else if (value <= _UTF8_THREE_BYTE_MAX) { |
83 encodedLength += 3; | 90 encodedLength += 3; |
84 } else if (value <= UNICODE_VALID_RANGE_MAX) { | 91 } else if (value <= UNICODE_VALID_RANGE_MAX) { |
85 encodedLength += 4; | 92 encodedLength += 4; |
86 } | 93 } |
87 } | 94 } |
88 | 95 |
89 List<int> encoded = new List<int>(encodedLength); | 96 List<int> encoded = new List<int>(encodedLength); |
90 int insertAt = 0; | 97 int insertAt = 0; |
91 for (int value in source) { | 98 for (int value in source) { |
92 if (value < 0 || value > UNICODE_VALID_RANGE_MAX) { | 99 if (value < 0 || value > UNICODE_VALID_RANGE_MAX) { |
93 encoded.setRange(insertAt, insertAt + 3, [0xef, 0xbf, 0xbd]); | 100 encoded.setRange(insertAt, insertAt + 3, [0xef, 0xbf, 0xbd]); |
94 insertAt += 3; | 101 insertAt += 3; |
95 } else if (value <= _UTF8_ONE_BYTE_MAX) { | 102 } else if (value <= _UTF8_ONE_BYTE_MAX) { |
96 encoded[insertAt] = value; | 103 encoded[insertAt] = value; |
97 insertAt++; | 104 insertAt++; |
98 } else if (value <= _UTF8_TWO_BYTE_MAX) { | 105 } else if (value <= _UTF8_TWO_BYTE_MAX) { |
99 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_TWO_BASE | ( | 106 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_TWO_BASE | |
100 _UTF8_FIRST_BYTE_OF_TWO_MASK & | 107 (_UTF8_FIRST_BYTE_OF_TWO_MASK & |
101 _addToEncoding(insertAt, 1, value, encoded)); | 108 _addToEncoding(insertAt, 1, value, encoded)); |
102 insertAt += 2; | 109 insertAt += 2; |
103 } else if (value <= _UTF8_THREE_BYTE_MAX) { | 110 } else if (value <= _UTF8_THREE_BYTE_MAX) { |
104 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_THREE_BASE | ( | 111 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_THREE_BASE | |
105 _UTF8_FIRST_BYTE_OF_THREE_MASK & | 112 (_UTF8_FIRST_BYTE_OF_THREE_MASK & |
106 _addToEncoding(insertAt, 2, value, encoded)); | 113 _addToEncoding(insertAt, 2, value, encoded)); |
107 insertAt += 3; | 114 insertAt += 3; |
108 } else if (value <= UNICODE_VALID_RANGE_MAX) { | 115 } else if (value <= UNICODE_VALID_RANGE_MAX) { |
109 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_FOUR_BASE | ( | 116 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_FOUR_BASE | |
110 _UTF8_FIRST_BYTE_OF_FOUR_MASK & | 117 (_UTF8_FIRST_BYTE_OF_FOUR_MASK & |
111 _addToEncoding(insertAt, 3, value, encoded)); | 118 _addToEncoding(insertAt, 3, value, encoded)); |
112 insertAt += 4; | 119 insertAt += 4; |
113 } | 120 } |
114 } | 121 } |
115 return encoded; | 122 return encoded; |
116 } | 123 } |
117 | 124 |
118 // Because UTF-8 specifies byte order, we do not have to follow the pattern | 125 // Because UTF-8 specifies byte order, we do not have to follow the pattern |
119 // used by UTF-16 & UTF-32 regarding byte order. | 126 // used by UTF-16 & UTF-32 regarding byte order. |
120 List<int> utf8ToCodepoints( | 127 List<int> utf8ToCodepoints(List<int> utf8EncodedBytes, |
121 List<int> utf8EncodedBytes, [int offset = 0, int length, | 128 [int offset = 0, |
| 129 int length, |
122 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 130 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
123 return new Utf8Decoder(utf8EncodedBytes, offset, length, | 131 return new Utf8Decoder(utf8EncodedBytes, offset, length, replacementCodepoint) |
124 replacementCodepoint).decodeRest(); | 132 .decodeRest(); |
125 } | 133 } |
126 | 134 |
127 /** | 135 /** |
128 * Return type of [decodeUtf8AsIterable] and variants. The Iterable type | 136 * Return type of [decodeUtf8AsIterable] and variants. The Iterable type |
129 * provides an iterator on demand and the iterator will only translate bytes | 137 * provides an iterator on demand and the iterator will only translate bytes |
130 * as requested by the user of the iterator. (Note: results are not cached.) | 138 * as requested by the user of the iterator. (Note: results are not cached.) |
131 */ | 139 */ |
132 // TODO(floitsch): Consider removing the extend and switch to implements since | 140 // TODO(floitsch): Consider removing the extend and switch to implements since |
133 // that's cheaper to allocate. | 141 // that's cheaper to allocate. |
134 class IterableUtf8Decoder extends IterableBase<int> { | 142 class IterableUtf8Decoder extends IterableBase<int> { |
135 final List<int> bytes; | 143 final List<int> bytes; |
136 final int offset; | 144 final int offset; |
137 final int length; | 145 final int length; |
138 final int replacementCodepoint; | 146 final int replacementCodepoint; |
139 | 147 |
140 IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length = null, | 148 IterableUtf8Decoder(this.bytes, |
| 149 [this.offset = 0, |
| 150 this.length = null, |
141 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]); | 151 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]); |
142 | 152 |
143 Utf8Decoder get iterator => | 153 Utf8Decoder get iterator => |
144 new Utf8Decoder(bytes, offset, length, replacementCodepoint); | 154 new Utf8Decoder(bytes, offset, length, replacementCodepoint); |
145 } | 155 } |
146 | 156 |
147 /** | 157 /** |
148 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The | 158 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The |
149 * parameters can set an offset into a list of bytes (as int), limit the length | 159 * parameters can set an offset into a list of bytes (as int), limit the length |
150 * of the values to be decoded, and override the default Unicode replacement | 160 * of the values to be decoded, and override the default Unicode replacement |
151 * character. Set replacementCodepoint to null to throw an | 161 * character. Set replacementCodepoint to null to throw an |
152 * ArgumentError rather than replace the bad value. This class is an | 162 * ArgumentError rather than replace the bad value. This class is an |
153 * [Iterator]; use [IterableUtf8Decoder] for an Iterable (e.g. in a for-loop). | 163 * [Iterator]; use [IterableUtf8Decoder] for an Iterable (e.g. in a for-loop). |
154 */ | 164 */ |
155 class Utf8Decoder implements Iterator<int> { | 165 class Utf8Decoder implements Iterator<int> { |
156 // TODO(kevmoo): should this field be private? | 166 // TODO(kevmoo): should this field be private? |
157 final ListRangeIterator utf8EncodedBytesIterator; | 167 final ListRangeIterator utf8EncodedBytesIterator; |
158 final int replacementCodepoint; | 168 final int replacementCodepoint; |
159 int _current = null; | 169 int _current = null; |
160 | 170 |
161 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length, | 171 Utf8Decoder(List<int> utf8EncodedBytes, |
162 this.replacementCodepoint = | 172 [int offset = 0, |
163 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : | 173 int length, |
164 utf8EncodedBytesIterator = | 174 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) |
165 (new ListRange(utf8EncodedBytes, offset, length)).iterator; | 175 : utf8EncodedBytesIterator = |
| 176 (new ListRange(utf8EncodedBytes, offset, length)).iterator; |
166 | 177 |
167 | 178 Utf8Decoder._fromListRangeIterator(ListRange source, |
168 Utf8Decoder._fromListRangeIterator(ListRange source, [ | 179 [this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) |
169 this.replacementCodepoint = | 180 : utf8EncodedBytesIterator = source.iterator; |
170 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : | |
171 utf8EncodedBytesIterator = source.iterator; | |
172 | 181 |
173 /** Decode the remainder of the characters in this decoder | 182 /** Decode the remainder of the characters in this decoder |
174 * into a [List<int>]. | 183 * into a [List<int>]. |
175 */ | 184 */ |
176 List<int> decodeRest() { | 185 List<int> decodeRest() { |
177 List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining); | 186 List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining); |
178 int i = 0; | 187 int i = 0; |
179 while (moveNext()) { | 188 while (moveNext()) { |
180 codepoints[i++] = current; | 189 codepoints[i++] = current; |
181 } | 190 } |
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
247 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK)); | 256 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK)); |
248 } else { | 257 } else { |
249 // if sequence-starting code unit, reposition cursor to start here | 258 // if sequence-starting code unit, reposition cursor to start here |
250 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) { | 259 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) { |
251 utf8EncodedBytesIterator.backup(); | 260 utf8EncodedBytesIterator.backup(); |
252 } | 261 } |
253 break; | 262 break; |
254 } | 263 } |
255 j++; | 264 j++; |
256 } | 265 } |
257 bool validSequence = (j == additionalBytes && ( | 266 bool validSequence = (j == additionalBytes && |
258 value < UNICODE_UTF16_RESERVED_LO || | 267 (value < UNICODE_UTF16_RESERVED_LO || |
259 value > UNICODE_UTF16_RESERVED_HI)); | 268 value > UNICODE_UTF16_RESERVED_HI)); |
260 bool nonOverlong = | 269 bool nonOverlong = (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) || |
261 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) || | |
262 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) || | 270 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) || |
263 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX); | 271 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX); |
264 bool inRange = value <= UNICODE_VALID_RANGE_MAX; | 272 bool inRange = value <= UNICODE_VALID_RANGE_MAX; |
265 if (validSequence && nonOverlong && inRange) { | 273 if (validSequence && nonOverlong && inRange) { |
266 _current = value; | 274 _current = value; |
267 return true; | 275 return true; |
268 } else if (replacementCodepoint != null) { | 276 } else if (replacementCodepoint != null) { |
269 _current = replacementCodepoint; | 277 _current = replacementCodepoint; |
270 return true; | 278 return true; |
271 } else { | 279 } else { |
272 throw new ArgumentError( | 280 throw new ArgumentError( |
273 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}"); | 281 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}"); |
274 } | 282 } |
275 } | 283 } |
276 } | 284 } |
OLD | NEW |