Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(345)

Side by Side Diff: dart/utils/utf8/utf8.dart

Issue 9185046: Move UTF-8 decoder to utils. (Closed) Base URL: http://dart.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 8 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « dart/frog/leg/scanner/vm_scanner_bench.dart ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 /** 5 #library('utf8');
6 * An abstract string representation. 6
7 */ 7 class Utf8Decoder {
8 class ByteString implements SourceString {
9 final List<int> bytes; 8 final List<int> bytes;
10 final int offset; 9 final int offset;
11 final int length; 10 final int length;
12 int _hashCode;
13 11
14 ByteString(List<int> this.bytes, int this.offset, int this.length); 12 Utf8Decoder(List<int> this.bytes, int this.offset, int this.length);
15
16 abstract String get charset();
17 13
18 String toString() { 14 String toString() {
19 var list; 15 return new String.fromCharCodes(decodeUtf8(bytes.getRange(offset, length)));
20 try {
21 list = bytes.getRange(offset, length);
22 } catch (var ignored) {
23 // An exception may occur when running this on node. This is
24 // because [bytes] really is a buffer (or typed array).
25 list = new List<int>(length);
26 for (int i = 0; i < length; i++) {
27 list[i] = bytes[i + offset];
28 }
29 }
30 return new String.fromCharCodes(decodeUtf8(list));
31 } 16 }
32 17
33 static int decodeTrailing(int byte) { 18 static int decodeTrailing(int byte) {
34 if (byte < 0x80 || 0xBF < byte) { 19 if (byte < 0x80 || 0xBF < byte) {
35 throw new MalformedInputException('Cannot decode UTF-8 $byte'); 20 throw new Exception('Cannot decode UTF-8 $byte');
36 } else { 21 } else {
37 return byte & 0x3F; 22 return byte & 0x3F;
38 } 23 }
39 } 24 }
40 25
41 static List<int> decodeUtf8(List<int> bytes) { 26 static List<int> decodeUtf8(List<int> bytes) {
42 List<int> result = new List<int>(); 27 List<int> result = new List<int>();
43 for (int i = 0; i < bytes.length; i++) { 28 for (int i = 0; i < bytes.length; i++) {
44 if (bytes[i] < 0x80) { 29 if (bytes[i] < 0x80) {
45 result.add(bytes[i]); 30 result.add(bytes[i]);
46 } else if (bytes[i] < 0xC2) { 31 } else if (bytes[i] < 0xC2) {
47 throw new MalformedInputException('Cannot decode UTF-8 @ $i'); 32 throw new Exception('Cannot decode UTF-8 @ $i');
48 } else if (bytes[i] < 0xE0) { 33 } else if (bytes[i] < 0xE0) {
49 int char = (bytes[i++] & 0x1F) << 6; 34 int char = (bytes[i++] & 0x1F) << 6;
50 char += decodeTrailing(bytes[i]); 35 char += decodeTrailing(bytes[i]);
51 if (char < 0x80) { 36 if (char < 0x80) {
52 throw new MalformedInputException('Cannot decode UTF-8 @ ${i-1}'); 37 throw new Exception('Cannot decode UTF-8 @ ${i-1}');
53 } else { 38 } else {
54 result.add(char); 39 result.add(char);
55 } 40 }
56 } else if (bytes[i] < 0xF0) { 41 } else if (bytes[i] < 0xF0) {
57 int char = (bytes[i++] & 0x0F) << 6; 42 int char = (bytes[i++] & 0x0F) << 6;
58 char += decodeTrailing(bytes[i++]); 43 char += decodeTrailing(bytes[i++]);
59 char <<= 6; 44 char <<= 6;
60 char += decodeTrailing(bytes[i]); 45 char += decodeTrailing(bytes[i]);
61 if (char < 0x800 || (0xD800 <= char && char <= 0xDFFF)) { 46 if (char < 0x800 || (0xD800 <= char && char <= 0xDFFF)) {
62 throw new MalformedInputException('Cannot decode UTF-8 @ ${i-2}'); 47 throw new Exception('Cannot decode UTF-8 @ ${i-2}');
63 } else { 48 } else {
64 result.add(char); 49 result.add(char);
65 } 50 }
66 } else if (bytes[i] < 0xF8) { 51 } else if (bytes[i] < 0xF8) {
67 int char = (bytes[i++] & 0x07) << 6; 52 int char = (bytes[i++] & 0x07) << 6;
68 char += decodeTrailing(bytes[i++]); 53 char += decodeTrailing(bytes[i++]);
69 char <<= 6; 54 char <<= 6;
70 char += decodeTrailing(bytes[i++]); 55 char += decodeTrailing(bytes[i++]);
71 char <<= 6; 56 char <<= 6;
72 char += decodeTrailing(bytes[i]); 57 char += decodeTrailing(bytes[i]);
73 if (char < 0x10000) { 58 if (char < 0x10000) {
74 throw new MalformedInputException('Cannot decode UTF-8 @ ${i-3}'); 59 throw new Exception('Cannot decode UTF-8 @ ${i-3}');
75 } else { 60 } else {
76 result.add(char); 61 result.add(char);
77 } 62 }
78 } else { 63 } else {
79 throw new MalformedInputException('Cannot decode UTF-8 @ $i'); 64 throw new Exception('Cannot decode UTF-8 @ $i');
80 } 65 }
81 } 66 }
82 return result; 67 return result;
83 } 68 }
84
85 bool operator ==(other) {
86 throw "should be overridden in subclass";
87 }
88
89 int hashCode() {
90 if (_hashCode === null) {
91 _hashCode = computeHashCode();
92 }
93 return _hashCode;
94 }
95
96 int computeHashCode() {
97 int code = 1;
98 int end = offset + length;
99 for (int i = offset; i < end; i++) {
100 code += 19 * code + bytes[i];
101 }
102 return code;
103 }
104
105 printOn(StringBuffer sb) {
106 sb.add(toString());
107 }
108 } 69 }
109
110 /**
111 * A string that consists purely of 7bit ASCII characters.
112 */
113 class AsciiString extends ByteString {
114 final String charset = "ASCII";
115
116 AsciiString(List<int> bytes, int offset, int length)
117 : super(bytes, offset, length);
118
119 static AsciiString of(List<int> bytes, int offset, int length) {
120 AsciiString string = new AsciiString(bytes, offset, length);
121 return string;
122 }
123
124 static AsciiString fromString(String string) {
125 List<int> bytes = string.charCodes();
126 return AsciiString.of(bytes, 0, bytes.length);
127 }
128 }
129
130 /**
131 * A string that consists of characters that can be encoded as UTF-8.
132 */
133 class Utf8String extends ByteString {
134 final String charset = "UTF8";
135
136 Utf8String(List<int> bytes, int offset, int length)
137 : super(bytes, offset, length);
138
139 static Utf8String of(List<int> bytes, int offset, int length) {
140 return new Utf8String(bytes, offset, length);
141 }
142
143 static Utf8String fromString(String string) {
144 throw "not implemented yet";
145 }
146 }
147
148 /**
149 * A ByteString-valued token.
150 */
151 class ByteStringToken extends Token {
152 final ByteString value;
153
154 ByteStringToken(PrecedenceInfo info, ByteString this.value, int charOffset)
155 : super(info, charOffset);
156
157 String toString() => value.toString();
158 }
OLDNEW
« no previous file with comments | « dart/frog/leg/scanner/vm_scanner_bench.dart ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698