Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1205)

Side by Side Diff: sdk/lib/convert/utf.dart

Issue 19187002: Replace old utf8 decoder with new one. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Add comments. Created 7 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 part of dart.convert; 5 part of dart.convert;
6 6
7 /** 7 /**
8 * A [Utf8Encoder] converts strings to their UTF-8 code units (a list of 8 * A [Utf8Encoder] converts strings to their UTF-8 code units (a list of
9 * unsigned 8-bit integers). 9 * unsigned 8-bit integers).
10 */ 10 */
11 class Utf8Encoder extends Converter<String, List<int>> { 11 class Utf8Encoder extends Converter<String, List<int>> {
12 /** 12 /**
13 * Converts [string] to its UTF-8 code units (a list of 13 * Converts [string] to its UTF-8 code units (a list of
14 * unsigned 8-bit integers). 14 * unsigned 8-bit integers).
15 */ 15 */
16 List<int> convert(String string) => OLD_UTF_LIB.encodeUtf8(string); 16 List<int> convert(String string) => OLD_UTF_LIB.encodeUtf8(string);
17 } 17 }
18 18
19 /** 19 /**
20 * A [Utf8Decoder] converts UTF-8 code units (lists of unsigned 8-bit integers) 20 * A [Utf8Decoder] converts UTF-8 code units (lists of unsigned 8-bit integers)
21 * to a string. 21 * to a string.
22 */ 22 */
23 class Utf8Decoder extends Converter<List<int>, String> { 23 class Utf8Decoder extends Converter<List<int>, String> {
24 final bool _allowMalformed;
25
26 /**
27 * Instantiates a new [Utf8Decoder].
28 *
29 * The optional [allowMalformed] argument defines how [convert] deals
30 * with invalid or unterminated character sequences.
31 *
32 * If it is `true`, [convert] replaces invalid (or unterminated) character
33 * sequences with the Unicode Replacement Character `U+FFFD` (�). Otherwise
Lasse Reichstein Nielsen 2013/07/16 12:23:03 U+FFFD
floitsch 2013/07/16 14:25:24 Done.
34 * it throws a [FormatException].
35 */
36 Utf8Decoder({ bool allowMalformed: false })
37 : this._allowMalformed = allowMalformed;
38
24 /** 39 /**
25 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the 40 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the
26 * corresponding string. 41 * corresponding string.
27 */ 42 */
28 // TODO(floitsch): allow to configure the decoder (for example the replacement 43 String convert(List<int> codeUnits) {
29 // character). 44 StringBuffer buffer = new StringBuffer();
30 String convert(List<int> codeUnits) => OLD_UTF_LIB.decodeUtf8(codeUnits); 45 _Utf8Decoder decoder = new _Utf8Decoder(_allowMalformed);
46 decoder.convert(codeUnits, 0, codeUnits.length, buffer);
47 decoder.close(buffer);
48 return buffer.toString();
49 }
31 } 50 }
51
52 // UTF-8 constants.
53 const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits
54 const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits
55 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits
56 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max.
57
58 // UTF-16 constants.
59 const int _SURROGATE_MASK = 0xF800;
60 const int _SURROGATE_TAG_MASK = 0xFC00;
61 const int _SURROGATE_VALUE_MASK = 0x3FF;
62 const int _LEAD_SURROGATE_MIN = 0xD800;
63 const int _TAIL_SURROGATE_MIN = 0xDC00;
64
65 const int _REPLACEMENT_CHARACTER = 0xFFFD;
66
67 bool _isSurrogate(int codeUnit) =>
68 (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN;
69 bool _isLeadSurrogate(int codeUnit) =>
70 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN;
71 bool _isTailSurrogate(int codeUnit) =>
72 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN;
73 int _combineSurrogatePair(int lead, int tail) =>
74 0x10000 | ((lead & _SURROGATE_VALUE_MASK) << 10)
75 | (tail & _SURROGATE_VALUE_MASK);
76
77
78 /**
79 * Decodes UTF-8.
Lasse Reichstein Nielsen 2013/07/16 12:23:03 UTF-8.
floitsch 2013/07/16 14:25:24 Done.
80 *
81 * The decoder handles chunked input.
82 */
83 // TODO(floitsch): do we want to make this class public?
Lasse Reichstein Nielsen 2013/07/16 12:23:03 Sure, why not?
floitsch 2013/07/16 14:25:24 Later. But rephrased TODO.
84 class _Utf8Decoder {
85 final bool _allowMalformed;
86 int _value = 0;
87 int _expectedUnits = 0;
88 int _extraUnits = 0;
89
90 _Utf8Decoder(this._allowMalformed);
91
92 bool get hasPartialInput => _expectedUnits > 0;
93
94 // Limits of one through four byte encodings.
95 static const List<int> _LIMITS = const <int>[
96 _ONE_BYTE_LIMIT,
97 _TWO_BYTE_LIMIT,
98 _THREE_BYTE_LIMIT,
99 _FOUR_BYTE_LIMIT ];
100
101 void close(StringSink sink) {
102 if (hasPartialInput) {
103 _throwIfNecessary("Unfinished UTF-8 encoding");
104 sink.writeCharCode(_REPLACEMENT_CHARACTER);
105 }
106 }
107
108 void convert(List<int> codeUnits, int startIndex, int endIndex,
109 StringSink sink) {
110 int value = _value;
111 int expectedUnits = _expectedUnits;
112 int extraUnits = _extraUnits;
113 _value = 0;
114 _expectedUnits = 0;
115 _extraUnits = 0;
116
117 int i = startIndex;
118 loop: while (true) {
119 multibyte: if (expectedUnits > 0) {
120 do {
121 if (i == endIndex) {
122 break loop;
123 }
124 int unit = codeUnits[i];
125 if ((unit & 0xC0) != 0x80) {
126 expectedUnits = 0;
127 _throwIfNecessary(
128 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
129 sink.writeCharCode(_REPLACEMENT_CHARACTER);
130 break multibyte;
131 } else {
132 value = (value << 6) | (unit & 0x3f);
133 expectedUnits--;
134 i++;
135 }
136 } while (expectedUnits > 0);
137 if (value <= _LIMITS[extraUnits - 1]) {
138 // Overly long encoding. The value could be encoded with a shorter
139 // encoding.
140 _throwIfNecessary(
141 "Overlong encoding of 0x${value.toRadixString(16)}");
142 value = _REPLACEMENT_CHARACTER;
143 }
144 sink.writeCharCode(value);
145 }
146
147 while (i < endIndex) {
148 int unit = codeUnits[i++];
149 if (unit <= _ONE_BYTE_LIMIT) {
150 sink.writeCharCode(unit);
151 } else {
152 if ((unit & 0xE0) == 0xC0) {
153 value = unit & 0x1F;
154 expectedUnits = extraUnits = 1;
155 continue loop;
156 }
157 if ((unit & 0xF0) == 0xE0) {
158 value = unit & 0x0F;
159 expectedUnits = extraUnits = 2;
160 continue loop;
161 }
162 if ((unit & 0xF8) == 0xF0) {
163 value = unit & 0x07;
164 expectedUnits = extraUnits = 3;
165 continue loop;
166 }
167 _throwIfNecessary("Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
Lasse Reichstein Nielsen 2013/07/16 12:23:03 Seems inefficient to create the string and not use
floitsch 2013/07/16 14:25:24 inlined.
168 value = _REPLACEMENT_CHARACTER;
169 expectedUnits = extraUnits = 0;
170 sink.writeCharCode(value);
171 }
172 }
173 break loop;
174 }
175 if (expectedUnits > 0) {
176 _value = value;
177 _expectedUnits = expectedUnits;
178 _extraUnits = extraUnits;
179 }
180 }
181
182 void _throwIfNecessary(String message) {
183 if (!_allowMalformed) {
184 throw new FormatException(message);
185 }
186 }
187 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698