| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | |
| 2 // for details. All rights reserved. Use of this source code is governed by a | |
| 3 // BSD-style license that can be found in the LICENSE file. | |
| 4 | |
| 5 #ifndef UNICODE_H_ | |
| 6 #define UNICODE_H_ | |
| 7 | |
| 8 #include "struct.h" | |
| 9 | |
// Range predicates over Unicode code point values.
class Utf {
 public:
  // Highest code point value treated as valid (U+10FFFF).
  static const int32_t kMaxCodePoint = 0x10FFFF;

  // True when code_point lies in [U+0000, U+00FF].
  static bool IsLatin1(int32_t code_point) {
    if (code_point < 0) {
      return false;
    }
    return code_point <= 0xFF;
  }

  // True when code_point lies in [U+0000, U+FFFF].
  static bool IsBmp(int32_t code_point) {
    if (code_point < 0) {
      return false;
    }
    return code_point <= 0xFFFF;
  }

  // True when code_point lies in [U+10000, kMaxCodePoint].
  static bool IsSupplementary(int32_t code_point) {
    if (code_point <= 0xFFFF) {
      return false;
    }
    return code_point <= kMaxCodePoint;
  }

  // True when code_point is not a valid code point value: either negative
  // or greater than kMaxCodePoint.
  static bool IsOutOfRange(intptr_t code_point) {
    const bool in_range = (code_point >= 0) && (code_point <= kMaxCodePoint);
    return !in_range;
  }
};
| 31 | |
| 32 class Utf8 { | |
| 33 public: | |
| 34 enum Type { | |
| 35 kLatin1 = 0, // Latin-1 code point [U+0000, U+00FF]. | |
| 36 kBMP, // Basic Multilingual Plane code point [U+0000, U+FFFF]. | |
| 37 kSupplementary, // Supplementary code point [U+010000, U+10FFFF]. | |
| 38 }; | |
| 39 | |
| 40 // Returns the most restricted coding form in which the sequence of utf8 | |
| 41 // characters in 'utf8_array' can be represented in, and the number of | |
| 42 // code units needed in that form. | |
| 43 static intptr_t CodeUnitCount(const char* utf8_array, | |
| 44 intptr_t array_len, | |
| 45 Type* type); | |
| 46 | |
| 47 static intptr_t Length(int32_t ch); | |
| 48 static intptr_t Length(List<uint16_t> str); | |
| 49 | |
| 50 static intptr_t Encode(int32_t ch, char* dst); | |
| 51 static intptr_t Encode(List<uint16_t> str, char* dst, intptr_t len); | |
| 52 | |
| 53 static intptr_t Decode(const char* utf8_array, | |
| 54 intptr_t array_len, | |
| 55 int32_t* ch); | |
| 56 static bool DecodeToUTF16(const char* utf8_array, | |
| 57 intptr_t array_len, | |
| 58 uint16_t* dst, | |
| 59 intptr_t len); | |
| 60 | |
| 61 static const int32_t kMaxOneByteChar = 0x7F; | |
| 62 static const int32_t kMaxTwoByteChar = 0x7FF; | |
| 63 static const int32_t kMaxThreeByteChar = 0xFFFF; | |
| 64 static const int32_t kMaxFourByteChar = Utf::kMaxCodePoint; | |
| 65 | |
| 66 private: | |
| 67 static bool IsTrailByte(uint8_t code_unit) { | |
| 68 return (code_unit & 0xC0) == 0x80; | |
| 69 } | |
| 70 | |
| 71 static bool IsNonShortestForm(uint32_t code_point, size_t num_code_units) { | |
| 72 return code_point < kOverlongMinimum[num_code_units]; | |
| 73 } | |
| 74 | |
| 75 static bool IsLatin1SequenceStart(uint8_t code_unit) { | |
| 76 // Check if utf8 sequence is the start of a codepoint <= U+00FF | |
| 77 return (code_unit <= 0xC3); | |
| 78 } | |
| 79 | |
| 80 static bool IsSupplementarySequenceStart(uint8_t code_unit) { | |
| 81 // Check if utf8 sequence is the start of a codepoint >= U+10000. | |
| 82 return (code_unit >= 0xF0); | |
| 83 } | |
| 84 | |
| 85 static const int8_t kTrailBytes[]; | |
| 86 static const uint32_t kMagicBits[]; | |
| 87 static const uint32_t kOverlongMinimum[]; | |
| 88 }; | |
| 89 | |
// UTF-16 helpers: surrogate classification, surrogate-pair decoding and
// forward iteration over a buffer of 16-bit code units.
class Utf16 {
 public:
  // Number of UTF-16 code units needed for ch: 2 when ch is above
  // kMaxCodeUnit, otherwise 1.
  static intptr_t Length(int32_t ch) {
    if (ch > Utf16::kMaxCodeUnit) {
      return 2;
    }
    return 1;
  }

  // True when ch is any surrogate, lead or trail: [0xD800, 0xDFFF].
  static bool IsSurrogate(int32_t ch) {
    return (ch >= 0xD800) && (ch <= 0xDFFF);
  }

  // True when ch is a lead (high) surrogate: [0xD800, 0xDBFF].
  static bool IsLeadSurrogate(int32_t ch) {
    return (ch >= 0xD800) && (ch <= 0xDBFF);
  }

  // True when ch is a trail (low) surrogate: [0xDC00, 0xDFFF].
  static bool IsTrailSurrogate(int32_t ch) {
    return (ch >= 0xDC00) && (ch <= 0xDFFF);
  }

  // Reads the character starting at index *i and moves *i past it. A lead
  // surrogate immediately followed (within len) by a trail surrogate is
  // combined into one code point and consumes two code units; anything else,
  // including an unpaired surrogate, is returned as-is and consumes one.
  // No bounds check is made on *i itself before the first read.
  static int32_t Next(const uint16_t* characters, intptr_t* i, intptr_t len) {
    const intptr_t pos = *i;
    const int32_t first = characters[pos];
    if (Utf16::IsLeadSurrogate(first) && (pos < (len - 1))) {
      const int32_t second = characters[pos + 1];
      if (Utf16::IsTrailSurrogate(second)) {
        *i = pos + 2;
        return Utf16::Decode(first, second);
      }
    }
    *i = pos + 1;
    return first;
  }

  // Combines a lead/trail surrogate pair into a supplementary code point.
  // Only the low ten bits of each argument contribute to the result.
  static int32_t Decode(int32_t lead, int32_t trail) {
    const int32_t high_part = (lead & 0x3FF) << 10;
    const int32_t low_part = trail & 0x3FF;
    return 0x10000 + high_part + low_part;
  }

  // Encodes a single code point into dst. Defined outside this header.
  static void Encode(int32_t codepoint, uint16_t* dst);

  // Largest value that fits in a single UTF-16 code unit.
  static const int32_t kMaxCodeUnit = 0xFFFF;

 private:
  // Surrogate arithmetic constants. They are not referenced by the inline
  // code in this class; presumably used by the out-of-line Encode (confirm).
  static const int32_t kLeadSurrogateOffset = (0xD800 - (0x10000 >> 10));

  static const int32_t kSurrogateOffset = (0x10000 - (0xD800 << 10) - 0xDC00);
};
| 142 | |
| 143 #endif // UNICODE_H_ | |
| OLD | NEW |