Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: src/unicode.h

Issue 638643002: Update unicode to 7.0.0. (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge
Patch Set: addressed comment Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « src/scanner.h ('k') | src/unicode.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #ifndef V8_UNICODE_H_ 5 #ifndef V8_UNICODE_H_
6 #define V8_UNICODE_H_ 6 #define V8_UNICODE_H_
7 7
8 #include <sys/types.h> 8 #include <sys/types.h>
9 #include "src/globals.h" 9 #include "src/globals.h"
10 /** 10 /**
(...skipping 26 matching lines...) Expand all
37 : code_point_(code_point), 37 : code_point_(code_point),
38 value_(value) { } 38 value_(value) { }
39 uchar code_point_ : 21; 39 uchar code_point_ : 21;
40 bool value_ : 1; 40 bool value_ : 1;
41 }; 41 };
42 static const int kSize = size; 42 static const int kSize = size;
43 static const int kMask = kSize - 1; 43 static const int kMask = kSize - 1;
44 CacheEntry entries_[kSize]; 44 CacheEntry entries_[kSize];
45 }; 45 };
46 46
47
47 // A cache used in case conversion. It caches the value for characters 48 // A cache used in case conversion. It caches the value for characters
48 // that either have no mapping or map to a single character independent 49 // that either have no mapping or map to a single character independent
49 // of context. Characters that map to more than one character or that 50 // of context. Characters that map to more than one character or that
50 // map differently depending on context are always looked up. 51 // map differently depending on context are always looked up.
51 template <class T, int size = 256> 52 template <class T, int size = 256>
52 class Mapping { 53 class Mapping {
53 public: 54 public:
54 inline Mapping() { } 55 inline Mapping() { }
55 inline int get(uchar c, uchar n, uchar* result); 56 inline int get(uchar c, uchar n, uchar* result);
56 private: 57 private:
57 friend class Test; 58 friend class Test;
58 int CalculateValue(uchar c, uchar n, uchar* result); 59 int CalculateValue(uchar c, uchar n, uchar* result);
59 struct CacheEntry { 60 struct CacheEntry {
60 inline CacheEntry() : code_point_(kNoChar), offset_(0) { } 61 inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
61 inline CacheEntry(uchar code_point, signed offset) 62 inline CacheEntry(uchar code_point, signed offset)
62 : code_point_(code_point), 63 : code_point_(code_point),
63 offset_(offset) { } 64 offset_(offset) { }
64 uchar code_point_; 65 uchar code_point_;
65 signed offset_; 66 signed offset_;
66 static const int kNoChar = (1 << 21) - 1; 67 static const int kNoChar = (1 << 21) - 1;
67 }; 68 };
68 static const int kSize = size; 69 static const int kSize = size;
69 static const int kMask = kSize - 1; 70 static const int kMask = kSize - 1;
70 CacheEntry entries_[kSize]; 71 CacheEntry entries_[kSize];
71 }; 72 };
72 73
74
73 class UnicodeData { 75 class UnicodeData {
74 private: 76 private:
75 friend class Test; 77 friend class Test;
76 static int GetByteCount(); 78 static int GetByteCount();
77 static const uchar kMaxCodePoint; 79 static const uchar kMaxCodePoint;
78 }; 80 };
79 81
82
80 class Utf16 { 83 class Utf16 {
81 public: 84 public:
82 static inline bool IsSurrogatePair(int lead, int trail) { 85 static inline bool IsSurrogatePair(int lead, int trail) {
83 return IsLeadSurrogate(lead) && IsTrailSurrogate(trail); 86 return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
84 } 87 }
85 static inline bool IsLeadSurrogate(int code) { 88 static inline bool IsLeadSurrogate(int code) {
86 if (code == kNoPreviousCharacter) return false; 89 if (code == kNoPreviousCharacter) return false;
87 return (code & 0xfc00) == 0xd800; 90 return (code & 0xfc00) == 0xd800;
88 } 91 }
89 static inline bool IsTrailSurrogate(int code) { 92 static inline bool IsTrailSurrogate(int code) {
(...skipping 16 matching lines...) Expand all
106 // The illegality stems from the surrogate not being part of a pair. 109 // The illegality stems from the surrogate not being part of a pair.
107 static const int kUtf8BytesToCodeASurrogate = 3; 110 static const int kUtf8BytesToCodeASurrogate = 3;
108 static inline uint16_t LeadSurrogate(uint32_t char_code) { 111 static inline uint16_t LeadSurrogate(uint32_t char_code) {
109 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff); 112 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
110 } 113 }
111 static inline uint16_t TrailSurrogate(uint32_t char_code) { 114 static inline uint16_t TrailSurrogate(uint32_t char_code) {
112 return 0xdc00 + (char_code & 0x3ff); 115 return 0xdc00 + (char_code & 0x3ff);
113 } 116 }
114 }; 117 };
115 118
116 class Latin1 {
117 public:
118 static const unsigned kMaxChar = 0xff;
119 // Returns 0 if character does not convert to single latin-1 character
120 // or if the character does not convert back to latin-1 via inverse
121 // operation (upper to lower, etc).
122 static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
123 };
124 119
125 class Utf8 { 120 class Utf8 {
126 public: 121 public:
127 static inline uchar Length(uchar chr, int previous); 122 static inline uchar Length(uchar chr, int previous);
128 static inline unsigned EncodeOneByte(char* out, uint8_t c); 123 static inline unsigned EncodeOneByte(char* out, uint8_t c);
129 static inline unsigned Encode(char* out, 124 static inline unsigned Encode(char* out,
130 uchar c, 125 uchar c,
131 int previous, 126 int previous,
132 bool replace_invalid = false); 127 bool replace_invalid = false);
133 static uchar CalculateValue(const byte* str, 128 static uchar CalculateValue(const byte* str,
(...skipping 14 matching lines...) Expand all
148 static const unsigned kBytesSavedByCombiningSurrogates = 2; 143 static const unsigned kBytesSavedByCombiningSurrogates = 2;
149 static const unsigned kSizeOfUnmatchedSurrogate = 3; 144 static const unsigned kSizeOfUnmatchedSurrogate = 3;
150 // The maximum size a single UTF-16 code unit may take up when encoded as 145 // The maximum size a single UTF-16 code unit may take up when encoded as
151 // UTF-8. 146 // UTF-8.
152 static const unsigned kMax16BitCodeUnitSize = 3; 147 static const unsigned kMax16BitCodeUnitSize = 3;
153 static inline uchar ValueOf(const byte* str, 148 static inline uchar ValueOf(const byte* str,
154 unsigned length, 149 unsigned length,
155 unsigned* cursor); 150 unsigned* cursor);
156 }; 151 };
157 152
158
159 class Utf8DecoderBase {
160 public:
161 // Initialization done in subclass.
162 inline Utf8DecoderBase();
163 inline Utf8DecoderBase(uint16_t* buffer,
164 unsigned buffer_length,
165 const uint8_t* stream,
166 unsigned stream_length);
167 inline unsigned Utf16Length() const { return utf16_length_; }
168 protected:
169 // This reads all characters and sets the utf16_length_.
170 // The first buffer_length utf16 chars are cached in the buffer.
171 void Reset(uint16_t* buffer,
172 unsigned buffer_length,
173 const uint8_t* stream,
174 unsigned stream_length);
175 static void WriteUtf16Slow(const uint8_t* stream,
176 uint16_t* data,
177 unsigned length);
178 const uint8_t* unbuffered_start_;
179 unsigned utf16_length_;
180 bool last_byte_of_buffer_unused_;
181 private:
182 DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
183 };
184
185 template <unsigned kBufferSize>
186 class Utf8Decoder : public Utf8DecoderBase {
187 public:
188 inline Utf8Decoder() {}
189 inline Utf8Decoder(const char* stream, unsigned length);
190 inline void Reset(const char* stream, unsigned length);
191 inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
192 private:
193 uint16_t buffer_[kBufferSize];
194 };
195
196
197 struct Uppercase { 153 struct Uppercase {
198 static bool Is(uchar c); 154 static bool Is(uchar c);
199 }; 155 };
200 struct Lowercase { 156 struct Lowercase {
201 static bool Is(uchar c); 157 static bool Is(uchar c);
202 }; 158 };
203 struct Letter { 159 struct Letter {
204 static bool Is(uchar c); 160 static bool Is(uchar c);
205 }; 161 };
206 struct Number { 162 struct ID_Start {
163 static bool Is(uchar c);
164 };
165 struct ID_Continue {
207 static bool Is(uchar c); 166 static bool Is(uchar c);
208 }; 167 };
209 struct WhiteSpace { 168 struct WhiteSpace {
210 static bool Is(uchar c); 169 static bool Is(uchar c);
211 }; 170 };
212 struct LineTerminator { 171 struct LineTerminator {
213 static bool Is(uchar c); 172 static bool Is(uchar c);
214 }; 173 };
215 struct CombiningMark {
216 static bool Is(uchar c);
217 };
218 struct ConnectorPunctuation {
219 static bool Is(uchar c);
220 };
221 struct ToLowercase { 174 struct ToLowercase {
222 static const int kMaxWidth = 3; 175 static const int kMaxWidth = 3;
223 static const bool kIsToLower = true; 176 static const bool kIsToLower = true;
224 static int Convert(uchar c, 177 static int Convert(uchar c,
225 uchar n, 178 uchar n,
226 uchar* result, 179 uchar* result,
227 bool* allow_caching_ptr); 180 bool* allow_caching_ptr);
228 }; 181 };
229 struct ToUppercase { 182 struct ToUppercase {
230 static const int kMaxWidth = 3; 183 static const int kMaxWidth = 3;
(...skipping 21 matching lines...) Expand all
252 static const int kMaxWidth = 1; 205 static const int kMaxWidth = 1;
253 static int Convert(uchar c, 206 static int Convert(uchar c,
254 uchar n, 207 uchar n,
255 uchar* result, 208 uchar* result,
256 bool* allow_caching_ptr); 209 bool* allow_caching_ptr);
257 }; 210 };
258 211
259 } // namespace unibrow 212 } // namespace unibrow
260 213
261 #endif // V8_UNICODE_H_ 214 #endif // V8_UNICODE_H_
OLD | NEW
« no previous file with comments | « src/scanner.h ('k') | src/unicode.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698