OLD | NEW |
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 #ifndef VM_UNICODE_H_ | 5 #ifndef VM_UNICODE_H_ |
6 #define VM_UNICODE_H_ | 6 #define VM_UNICODE_H_ |
7 | 7 |
8 #include "vm/allocation.h" | 8 #include "vm/allocation.h" |
9 #include "vm/globals.h" | 9 #include "vm/globals.h" |
10 | 10 |
11 namespace dart { | 11 namespace dart { |
12 | 12 |
13 class String; | 13 class String; |
14 | 14 |
15 | |
16 class Utf8 : AllStatic { | 15 class Utf8 : AllStatic { |
17 public: | 16 public: |
18 enum Type { | 17 enum Type { |
19 kLatin1 = 0, // Latin-1 code point [U+0000, U+00FF]. | 18 kLatin1 = 0, // Latin-1 code point [U+0000, U+00FF]. |
20 kBMP, // Basic Multilingual Plane code point [U+0000, U+FFFF]. | 19 kBMP, // Basic Multilingual Plane code point [U+0000, U+FFFF]. |
21 kSupplementary, // Supplementary code point [U+010000, U+10FFFF]. | 20 kSupplementary, // Supplementary code point [U+010000, U+10FFFF]. |
22 }; | 21 }; |
23 | 22 |
24 static const intptr_t kMaxOneByteChar = 0x7F; | 23 static const intptr_t kMaxOneByteChar = 0x7F; |
25 static const intptr_t kMaxTwoByteChar = 0x7FF; | 24 static const intptr_t kMaxTwoByteChar = 0x7FF; |
26 static const intptr_t kMaxThreeByteChar = 0xFFFF; | 25 static const intptr_t kMaxThreeByteChar = 0xFFFF; |
27 static const intptr_t kMaxFourByteChar = 0x10FFFF; | 26 static const intptr_t kMaxFourByteChar = 0x10FFFF; |
28 | 27 |
29 static const int32_t kInvalidCodePoint = -1; | 28 static intptr_t CodePointCount(const uint8_t* utf8_array, |
30 | 29 intptr_t array_len, |
31 static intptr_t CodeUnitCount(const uint8_t* utf8_array, | 30 Type* type); |
32 intptr_t array_len, | |
33 Type* type); | |
34 | 31 |
35 // Returns true if 'utf8_array' is a valid UTF-8 string. | 32 // Returns true if 'utf8_array' is a valid UTF-8 string. |
36 static bool IsValid(const uint8_t* utf8_array, intptr_t array_len); | 33 static bool IsValid(const uint8_t* utf8_array, intptr_t array_len); |
37 | 34 |
38 static intptr_t Length(int32_t ch); | 35 static intptr_t Length(int32_t ch); |
39 static intptr_t Length(const String& str); | 36 static intptr_t Length(const String& str); |
40 | 37 |
41 static intptr_t Encode(int32_t ch, char* dst); | 38 static intptr_t Encode(int32_t ch, char* dst); |
42 static intptr_t Encode(const String& src, char* dst, intptr_t len); | 39 static intptr_t Encode(const String& src, char* dst, intptr_t len); |
43 | 40 |
44 static intptr_t Decode(const uint8_t* utf8_array, | 41 static intptr_t Decode(const uint8_t* utf8_array, |
45 intptr_t array_len, | 42 intptr_t array_len, |
46 int32_t* ch); | 43 int32_t* ch); |
47 | 44 |
48 static bool DecodeToLatin1(const uint8_t* utf8_array, | 45 static bool DecodeToLatin1(const uint8_t* utf8_array, |
49 intptr_t array_len, | 46 intptr_t array_len, |
50 uint8_t* dst, | 47 uint8_t* dst, |
51 intptr_t len); | 48 intptr_t len); |
52 static bool DecodeToUTF16(const uint8_t* utf8_array, | 49 static bool DecodeToUTF16(const uint8_t* utf8_array, |
53 intptr_t array_len, | 50 intptr_t array_len, |
54 uint16_t* dst, | 51 uint16_t* dst, |
55 intptr_t len); | 52 intptr_t len); |
56 static bool DecodeToUTF32(const uint8_t* utf8_array, | 53 static bool DecodeToUTF32(const uint8_t* utf8_array, |
57 intptr_t array_len, | 54 intptr_t array_len, |
58 int32_t* dst, | 55 uint32_t* dst, |
59 intptr_t len); | 56 intptr_t len); |
60 static bool DecodeCStringToUTF32(const char* str, | 57 static bool DecodeCStringToUTF32(const char* str, |
61 int32_t* dst, | 58 uint32_t* dst, |
62 intptr_t len) { | 59 intptr_t len) { |
63 ASSERT(str != NULL); | 60 ASSERT(str != NULL); |
64 intptr_t array_len = strlen(str); | 61 intptr_t array_len = strlen(str); |
65 const uint8_t* utf8_array = reinterpret_cast<const uint8_t*>(str); | 62 const uint8_t* utf8_array = reinterpret_cast<const uint8_t*>(str); |
66 return DecodeToUTF32(utf8_array, array_len, dst, len); | 63 return DecodeToUTF32(utf8_array, array_len, dst, len); |
67 } | 64 } |
68 }; | 65 }; |
69 | 66 |
70 | 67 |
71 class Utf16 : AllStatic { | 68 class Utf16 : AllStatic { |
72 public: | 69 public: |
73 static const int32_t kMaxBmpCodepoint = 0xFFFF; | 70 static const int32_t kMaxBmpCodepoint = 0xFFFF; |
74 static const int32_t kMaxCodeUnit = 0xFFFF; | |
75 static const int32_t kMaxCodePoint = 0x10FFFF; | |
76 | 71 |
77 static const int32_t kSurrogateEncodingBase = 0x10000; | 72 static const int32_t kLeadSurrogateOffset = (0xD800 - (0x10000 >> 10)); |
| 73 |
| 74 static const int32_t kSurrogateOffset = (0x10000 - (0xD800 << 10) - 0xDC00); |
78 | 75 |
79 // Returns the length of the code point in UTF-16 code units. | 76 // Returns the length of the code point in UTF-16 code units. |
80 static intptr_t Length(int32_t ch) { | 77 static intptr_t Length(int32_t ch) { |
81 return (ch <= kMaxBmpCodepoint) ? 1 : 2; | 78 return (ch <= kMaxBmpCodepoint) ? 1 : 2; |
82 } | 79 } |
83 | 80 |
84 // Returns true if ch is a lead or trail surrogate. | 81 // Returns true if ch is a lead or trail surrogate. |
85 static bool IsSurrogate(int32_t ch) { | 82 static bool IsSurrogate(int32_t ch) { |
86 return (ch & 0xFFFFF800) == 0xD800; | 83 return (ch & 0xFFFFF800) == 0xD800; |
87 } | 84 } |
88 | 85 |
89 // Returns true if ch is a lead surrogate. | 86 // Returns true if ch is a lead surrogate. |
90 static bool IsLeadSurrogate(int32_t ch) { | 87 static bool IsLeadSurrogate(int32_t ch) { |
91 return (ch & 0xFFFFFC00) == 0xD800; | 88 return (ch & 0xFFFFFC00) == 0xD800; |
92 } | 89 } |
93 | 90 |
94 // Returns true if ch is a trail surrogate. | 91 // Returns true if ch is a trail surrogate. |
95 static bool IsTrailSurrogate(int32_t ch) { | 92 static bool IsTrailSurrogate(int32_t ch) { |
96 return (ch & 0xFFFFFC00) == 0xDC00; | 93 return (ch & 0xFFFFFC00) == 0xDC00; |
97 } | 94 } |
98 | 95 |
99 // Decodes a surrogate pair into a supplementary code point. | 96 // Decodes a surrogate pair into a supplementary code point. |
100 static int32_t Decode(int32_t lead, int32_t trail) { | 97 static int32_t Decode(int32_t lead, int32_t trail) { |
101 ASSERT(IsLeadSurrogate(lead)); | 98 return 0x10000 + ((lead & 0x3FF) << 10) + (trail & 0x3FF); |
102 ASSERT(IsTrailSurrogate(trail)); | |
103 return kSurrogateEncodingBase + | |
104 ((lead & kSurrogateMask) << 10) + (trail & kSurrogateMask); | |
105 } | |
106 | |
107 static int32_t LeadFromCodePoint(int32_t code_point) { | |
108 ASSERT(code_point >= kSurrogateEncodingBase); | |
109 return kLeadBase + | |
110 (((code_point - kSurrogateEncodingBase) >> 10) & kSurrogateMask); | |
111 } | |
112 | |
113 static int32_t TrailFromCodePoint(int32_t code_point) { | |
114 ASSERT(code_point >= kSurrogateEncodingBase); | |
115 return kTrailBase + (code_point & kSurrogateMask); | |
116 } | 99 } |
117 | 100 |
118 // Encodes a single code point. | 101 // Encodes a single code point. |
119 static void Encode(int32_t codepoint, uint16_t* dst); | 102 static void Encode(int32_t codepoint, uint16_t* dst); |
120 | |
121 // Gets the 21 bit Unicode code point at the given index in a string. If the | |
122 // returned value is greater than kMaxCodeUnit then the next position of the | |
123 // string encodes a trail surrogate and should be skipped on iteration. May | |
124 // return individual surrogate values if they are not part of a pair. | |
125 static int32_t CodePointAt(const String& str, int index); | |
126 | |
127 private: | |
128 static const int32_t kLeadBase = 0xD800; | |
129 static const int32_t kTrailBase = 0xDC00; | |
130 static const int32_t kSurrogateMask = 0x3FF; | |
131 }; | 103 }; |
132 | 104 |
133 | 105 |
134 class CaseMapping : AllStatic { | 106 class CaseMapping : AllStatic { |
135 public: | 107 public: |
136 // Maps a code point to uppercase. | 108 // Maps a code point to uppercase. |
137 static int32_t ToUpper(int32_t code_point) { | 109 static int32_t ToUpper(int32_t code_point) { |
138 return Convert(code_point, kUppercase); | 110 return Convert(code_point, kUppercase); |
139 } | 111 } |
140 | 112 |
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
190 // Data for small code points with one mapping | 162 // Data for small code points with one mapping |
191 static const int16_t stage2_[]; | 163 static const int16_t stage2_[]; |
192 | 164 |
193 // Data for large code points or code points with both mappings. | 165 // Data for large code points or code points with both mappings. |
194 static const int32_t stage2_exception_[][2]; | 166 static const int32_t stage2_exception_[][2]; |
195 }; | 167 }; |
196 | 168 |
197 } // namespace dart | 169 } // namespace dart |
198 | 170 |
199 #endif // VM_UNICODE_H_ | 171 #endif // VM_UNICODE_H_ |
OLD | NEW |