| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | |
| 2 // for details. All rights reserved. Use of this source code is governed by a | |
| 3 // BSD-style license that can be found in the LICENSE file. | |
| 4 | |
| 5 part of utf; | |
| 6 | |
// Largest code point that fits in a one-, two- and three-byte UTF-8 sequence
// respectively. The encoder uses these to pick the sequence length and the
// decoder uses them to reject overlong encodings.
const int _UTF8_ONE_BYTE_MAX = 0x7f;
const int _UTF8_TWO_BYTE_MAX = 0x7ff;
const int _UTF8_THREE_BYTE_MAX = 0xffff;

// Selects the low six bits, i.e. the payload carried by each subsequent
// (non-leading) byte of a multi-byte sequence.
const int _UTF8_LO_SIX_BIT_MASK = 0x3f;

// Smallest leading-byte value for a sequence of the given total length. The
// decoder compares a leading byte against these thresholds in ascending order
// to determine how many additional bytes to read.
const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xc0;
const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xe0;
const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xf0;
const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xf8;
const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xfc;

// Masks applied by the encoder to the high bits left over after the
// subsequent bytes have been written, before OR-ing in the matching *_BASE.
const int _UTF8_FIRST_BYTE_OF_TWO_MASK = 0x1f;
const int _UTF8_FIRST_BYTE_OF_THREE_MASK = 0xf;
const int _UTF8_FIRST_BYTE_OF_FOUR_MASK = 0x7;

// Exclusive upper bound for a leading byte: values at or above this are never
// treated as the start of a sequence by the decoder.
const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xfe;
// Marker bits OR-ed into every subsequent byte written by the encoder.
const int _UTF8_SUBSEQUENT_BYTE_BASE = 0x80;
| 25 | |
/**
 * Wraps [bytes] in an [IterableUtf8Decoder] so that code points are decoded
 * on demand; a consumer that stops iterating early never pays for the rest of
 * the input.
 *
 * [offset] and [length] select the slice of [bytes] to decode. Malformed input
 * is reported as [replacementCodepoint]; pass `null` for
 * [replacementCodepoint] to have an [ArgumentError] thrown instead.
 */
IterableUtf8Decoder decodeUtf8AsIterable(List<int> bytes, [int offset = 0,
    int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) =>
    new IterableUtf8Decoder(bytes, offset, length, replacementCodepoint);
| 36 | |
/**
 * Decodes a list of UTF-8 encoded bytes into a [String].
 *
 * [offset] is the index of the first byte to decode and [length] limits how
 * many bytes are consumed. Malformed input is replaced with
 * [replacementCodepoint]; pass `null` for [replacementCodepoint] to have an
 * [ArgumentError] thrown instead.
 */
String decodeUtf8(List<int> bytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf8Decoder decoder =
      new Utf8Decoder(bytes, offset, length, replacementCodepoint);
  List<int> codepoints = decoder.decodeRest();
  return new String.fromCharCodes(codepoints);
}
| 50 | |
/**
 * Encodes [str] as UTF-8 and returns the resulting bytes.
 *
 * The string is first converted to code points with [stringToCodepoints] and
 * those are then encoded by [codepointsToUtf8].
 */
List<int> encodeUtf8(String str) {
  var codepoints = stringToCodepoints(str);
  return codepointsToUtf8(codepoints);
}
| 56 | |
/**
 * Writes the [bytes] subsequent (non-leading) bytes of a multi-byte sequence
 * into [buffer] and returns the bits of [value] that are left for the leading
 * byte.
 *
 * The slots `offset + 1` through `offset + bytes` are filled from the last one
 * backwards, each receiving the next six low-order bits of [value]. The slot
 * at [offset] itself is not written; the caller stores the leading byte there.
 */
int _addToEncoding(int offset, int bytes, int value, List<int> buffer) {
  for (int slot = offset + bytes; slot > offset; slot--) {
    buffer[slot] =
        _UTF8_SUBSEQUENT_BYTE_BASE | (value & _UTF8_LO_SIX_BIT_MASK);
    value >>= 6;
  }
  return value;
}
| 66 | |
/**
 * Encode code points as UTF-8 code units.
 *
 * [offset] and [length] select the slice of [codepoints] to encode. The
 * result is a fixed-length list sized exactly for the encoded output.
 *
 * Values that cannot be represented in well-formed UTF-8 are encoded as
 * U+FFFD (bytes `0xef 0xbf 0xbd`). That covers negative values, values above
 * [UNICODE_VALID_RANGE_MAX], and values in the UTF-16 surrogate range
 * [UNICODE_UTF16_RESERVED_LO]..[UNICODE_UTF16_RESERVED_HI]. Surrogates are
 * included because UTF-8 forbids encoding them and [Utf8Decoder] rejects such
 * sequences, so emitting them would produce output this library cannot read
 * back.
 */
List<int> codepointsToUtf8(
    List<int> codepoints, [int offset = 0, int length]) {
  _ListRange source = new _ListRange(codepoints, offset, length);

  // First pass: compute the exact output size. Surrogates fall in the
  // three-byte bucket, which is also the size of the replacement sequence, so
  // they need no special case here.
  int encodedLength = 0;
  for (int value in source) {
    if (value < 0 || value > UNICODE_VALID_RANGE_MAX) {
      encodedLength += 3;
    } else if (value <= _UTF8_ONE_BYTE_MAX) {
      encodedLength++;
    } else if (value <= _UTF8_TWO_BYTE_MAX) {
      encodedLength += 2;
    } else if (value <= _UTF8_THREE_BYTE_MAX) {
      encodedLength += 3;
    } else {
      encodedLength += 4;
    }
  }

  // Second pass: write the bytes.
  List<int> encoded = new List<int>(encodedLength);
  int insertAt = 0;
  for (int value in source) {
    bool isSurrogate = value >= UNICODE_UTF16_RESERVED_LO &&
        value <= UNICODE_UTF16_RESERVED_HI;
    if (value < 0 || value > UNICODE_VALID_RANGE_MAX || isSurrogate) {
      // Not encodable as well-formed UTF-8: emit U+FFFD.
      encoded.setRange(insertAt, insertAt + 3, [0xef, 0xbf, 0xbd]);
      insertAt += 3;
    } else if (value <= _UTF8_ONE_BYTE_MAX) {
      encoded[insertAt] = value;
      insertAt++;
    } else if (value <= _UTF8_TWO_BYTE_MAX) {
      // _addToEncoding fills the trailing byte(s) and hands back the high
      // bits that belong in the leading byte.
      encoded[insertAt] = _UTF8_FIRST_BYTE_OF_TWO_BASE | (
          _UTF8_FIRST_BYTE_OF_TWO_MASK &
          _addToEncoding(insertAt, 1, value, encoded));
      insertAt += 2;
    } else if (value <= _UTF8_THREE_BYTE_MAX) {
      encoded[insertAt] = _UTF8_FIRST_BYTE_OF_THREE_BASE | (
          _UTF8_FIRST_BYTE_OF_THREE_MASK &
          _addToEncoding(insertAt, 2, value, encoded));
      insertAt += 3;
    } else {
      encoded[insertAt] = _UTF8_FIRST_BYTE_OF_FOUR_BASE | (
          _UTF8_FIRST_BYTE_OF_FOUR_MASK &
          _addToEncoding(insertAt, 3, value, encoded));
      insertAt += 4;
    }
  }
  return encoded;
}
| 117 | |
/**
 * Decodes UTF-8 encoded bytes into a list of code points.
 *
 * UTF-8 fixes the byte order, so unlike the UTF-16 and UTF-32 entry points
 * there is no byte-order handling here.
 *
 * [offset] and [length] select the slice of [utf8EncodedBytes] to decode.
 * Malformed input is replaced with [replacementCodepoint]; pass `null` for
 * [replacementCodepoint] to have an [ArgumentError] thrown instead.
 */
List<int> utf8ToCodepoints(
    List<int> utf8EncodedBytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf8Decoder decoder = new Utf8Decoder(
      utf8EncodedBytes, offset, length, replacementCodepoint);
  return decoder.decodeRest();
}
| 126 | |
/**
 * Return type of [decodeUtf8AsIterable] and variants.
 *
 * Each access to [iterator] creates a new [Utf8Decoder] over the same bytes,
 * and that decoder only translates bytes as the consumer advances it.
 * (Note: results are not cached.)
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class IterableUtf8Decoder extends IterableBase<int> {
  /** The UTF-8 encoded input. */
  final List<int> bytes;

  /** Index in [bytes] at which decoding starts. */
  final int offset;

  /** Number of bytes to decode; passed through unchanged to [Utf8Decoder]. */
  final int length;

  /** Code point substituted for malformed input; `null` means throw. */
  final int replacementCodepoint;

  IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);

  Utf8Decoder get iterator {
    return new Utf8Decoder(bytes, offset, length, replacementCodepoint);
  }
}
| 146 | |
/**
 * An iterator of Unicode code points decoded from UTF-8 encoded bytes.
 *
 * The constructor parameters can set an offset into the list of bytes, limit
 * the number of bytes to be decoded, and override the default Unicode
 * replacement code point. Set [replacementCodepoint] to `null` to have an
 * [ArgumentError] thrown rather than the bad value replaced.
 */
class Utf8Decoder implements Iterator<int> {
  final _ListRangeIterator utf8EncodedBytesIterator;
  final int replacementCodepoint;
  int _current = null;

  Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length,
      this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf8EncodedBytesIterator =
          (new _ListRange(utf8EncodedBytes, offset, length)).iterator;


  Utf8Decoder._fromListRangeIterator(_ListRange source, [
      this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf8EncodedBytesIterator = source.iterator;

  /**
   * Decode the remainder of the characters in this decoder
   * into a [List<int>].
   */
  List<int> decodeRest() {
    // One code point per remaining byte is the upper bound; multi-byte
    // sequences make the real count smaller, hence the trim below.
    List<int> buffer = new List<int>(utf8EncodedBytesIterator.remaining);
    int count = 0;
    while (moveNext()) {
      buffer[count] = current;
      count++;
    }
    if (count != buffer.length) {
      List<int> trimmed = new List<int>(count);
      trimmed.setRange(0, count, buffer);
      return trimmed;
    }
    return buffer;
  }

  int get current => _current;

  /**
   * Advances to the next code point. Returns `false` once the input is
   * exhausted, otherwise `true` with [current] set to the decoded value or to
   * [replacementCodepoint] for malformed input.
   *
   * Throws an [ArgumentError] on malformed input when [replacementCodepoint]
   * is `null`.
   */
  bool moveNext() {
    _current = null;

    if (!utf8EncodedBytesIterator.moveNext()) return false;

    int value = utf8EncodedBytesIterator.current;

    if (value < 0) return _replaceOrThrow(0);
    if (value <= _UTF8_ONE_BYTE_MAX) {
      _current = value;
      return true;
    }
    // A subsequent (non-leading) byte with no leading byte before it.
    if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) return _replaceOrThrow(0);

    // Classify the leading byte: which base to strip and how many subsequent
    // bytes should follow.
    int leadBase;
    int additionalBytes;
    if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {
      leadBase = _UTF8_FIRST_BYTE_OF_TWO_BASE;
      additionalBytes = 1;
    } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {
      leadBase = _UTF8_FIRST_BYTE_OF_THREE_BASE;
      additionalBytes = 2;
    } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {
      leadBase = _UTF8_FIRST_BYTE_OF_FOUR_BASE;
      additionalBytes = 3;
    } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {
      leadBase = _UTF8_FIRST_BYTE_OF_FIVE_BASE;
      additionalBytes = 4;
    } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {
      leadBase = _UTF8_FIRST_BYTE_OF_SIX_BASE;
      additionalBytes = 5;
    } else {
      return _replaceOrThrow(0);
    }
    value -= leadBase;

    int consumed = 0;
    while (consumed < additionalBytes &&
           utf8EncodedBytesIterator.moveNext()) {
      int nextValue = utf8EncodedBytesIterator.current;
      bool isSubsequentByte = nextValue > _UTF8_ONE_BYTE_MAX &&
          nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE;
      if (!isSubsequentByte) {
        // If the interrupting byte could itself start a sequence, back the
        // iterator up so the next call begins there. Any other interrupting
        // byte stays consumed as part of this malformed sequence.
        if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) {
          utf8EncodedBytesIterator.backup();
        }
        break;
      }
      value = (value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK);
      consumed++;
    }

    bool complete = consumed == additionalBytes;
    bool isSurrogate = value >= UNICODE_UTF16_RESERVED_LO &&
        value <= UNICODE_UTF16_RESERVED_HI;
    // The value must need the sequence length that was used. Five- and
    // six-byte forms have no accepting case, so they are always rejected.
    bool nonOverlong;
    switch (additionalBytes) {
      case 1:
        nonOverlong = value > _UTF8_ONE_BYTE_MAX;
        break;
      case 2:
        nonOverlong = value > _UTF8_TWO_BYTE_MAX;
        break;
      case 3:
        nonOverlong = value > _UTF8_THREE_BYTE_MAX;
        break;
      default:
        nonOverlong = false;
    }
    bool inRange = value <= UNICODE_VALID_RANGE_MAX;

    if (complete && !isSurrogate && nonOverlong && inRange) {
      _current = value;
      return true;
    }
    return _replaceOrThrow(consumed);
  }

  /**
   * Handles malformed input: sets [current] to [replacementCodepoint] and
   * returns `true`, or throws an [ArgumentError] when no replacement is
   * configured. [backtrack] is subtracted from the iterator position in the
   * error message.
   */
  bool _replaceOrThrow(int backtrack) {
    if (replacementCodepoint == null) {
      throw new ArgumentError(
          "Invalid UTF8 at ${utf8EncodedBytesIterator.position - backtrack}");
    }
    _current = replacementCodepoint;
    return true;
  }
}
| OLD | NEW |