Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(219)

Side by Side Diff: src/unicode-inl.h

Issue 121173009: String::WriteUtf8: Add REPLACE_INVALID_UTF8 option (Closed) Base URL: git://github.com/v8/v8.git@master
Patch Set: DISALLOW_INVALID_UTF8 flag and fixes Created 6 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 // Copyright 2007-2010 the V8 project authors. All rights reserved. 1 // Copyright 2007-2010 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 89 matching lines...) Expand 10 before | Expand all | Expand 10 after
100 static const int kMask = ~(1 << 6); 100 static const int kMask = ~(1 << 6);
101 if (c <= kMaxOneByteChar) { 101 if (c <= kMaxOneByteChar) {
102 str[0] = c; 102 str[0] = c;
103 return 1; 103 return 1;
104 } 104 }
105 str[0] = 0xC0 | (c >> 6); 105 str[0] = 0xC0 | (c >> 6);
106 str[1] = 0x80 | (c & kMask); 106 str[1] = 0x80 | (c & kMask);
107 return 2; 107 return 2;
108 } 108 }
109 109
110 // Encode encodes the unicode code point c into the given str buffer. Unless
111 // allow_invalid is set to true, surrogate code points will be replaced with
112 // kReplacementCharacter. The caller is required to combine surrogate pairs
113 // into code points before calling Encode.
114 unsigned Utf8::Encode(char* str, uchar c, bool allow_invalid) {
115 if (!allow_invalid &&
dcarney 2014/01/07 10:12:16 move this block down into the kMaxThreeByteChar cl
116 (Utf16::IsLeadSurrogate(c) ||
117 Utf16::IsTrailSurrogate(c))) {
118 c = kReplacementCharacter;
119 }
110 120
111 unsigned Utf8::Encode(char* str, uchar c, int previous) {
112 static const int kMask = ~(1 << 6); 121 static const int kMask = ~(1 << 6);
113 if (c <= kMaxOneByteChar) { 122 if (c <= kMaxOneByteChar) {
114 str[0] = c; 123 str[0] = c;
115 return 1; 124 return 1;
116 } else if (c <= kMaxTwoByteChar) { 125 } else if (c <= kMaxTwoByteChar) {
117 str[0] = 0xC0 | (c >> 6); 126 str[0] = 0xC0 | (c >> 6);
118 str[1] = 0x80 | (c & kMask); 127 str[1] = 0x80 | (c & kMask);
119 return 2; 128 return 2;
120 } else if (c <= kMaxThreeByteChar) { 129 } else if (c <= kMaxThreeByteChar) {
121 if (Utf16::IsTrailSurrogate(c) &&
122 Utf16::IsLeadSurrogate(previous)) {
123 const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
124 return Encode(str - kUnmatchedSize,
125 Utf16::CombineSurrogatePair(previous, c),
126 Utf16::kNoPreviousCharacter) - kUnmatchedSize;
127 }
128 str[0] = 0xE0 | (c >> 12); 130 str[0] = 0xE0 | (c >> 12);
129 str[1] = 0x80 | ((c >> 6) & kMask); 131 str[1] = 0x80 | ((c >> 6) & kMask);
130 str[2] = 0x80 | (c & kMask); 132 str[2] = 0x80 | (c & kMask);
131 return 3; 133 return 3;
132 } else { 134 } else {
133 str[0] = 0xF0 | (c >> 18); 135 str[0] = 0xF0 | (c >> 18);
134 str[1] = 0x80 | ((c >> 12) & kMask); 136 str[1] = 0x80 | ((c >> 12) & kMask);
135 str[2] = 0x80 | ((c >> 6) & kMask); 137 str[2] = 0x80 | ((c >> 6) & kMask);
136 str[3] = 0x80 | (c & kMask); 138 str[3] = 0x80 | (c & kMask);
137 return 4; 139 return 4;
138 } 140 }
139 } 141 }
140 142
141 143
142 uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) { 144 uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {
143 if (length <= 0) return kBadChar; 145 if (length <= 0) return kBadChar;
144 byte first = bytes[0]; 146 byte first = bytes[0];
145 // Characters between 0000 and 0007F are encoded as a single character 147 // Characters between 0000 and 0007F are encoded as a single character
146 if (first <= kMaxOneByteChar) { 148 if (first <= kMaxOneByteChar) {
147 *cursor += 1; 149 *cursor += 1;
148 return first; 150 return first;
149 } 151 }
150 return CalculateValue(bytes, length, cursor); 152 return CalculateValue(bytes, length, cursor);
151 } 153 }
152 154
155 // @TODO give this the same semantics as Encode?
dcarney 2014/01/07 11:05:50 i don't see an easy way to do this. You'd have to
153 unsigned Utf8::Length(uchar c, int previous) { 156 unsigned Utf8::Length(uchar c, int previous) {
154 if (c <= kMaxOneByteChar) { 157 if (c <= kMaxOneByteChar) {
155 return 1; 158 return 1;
156 } else if (c <= kMaxTwoByteChar) { 159 } else if (c <= kMaxTwoByteChar) {
157 return 2; 160 return 2;
158 } else if (c <= kMaxThreeByteChar) { 161 } else if (c <= kMaxThreeByteChar) {
159 if (Utf16::IsTrailSurrogate(c) && 162 if (Utf16::IsTrailSurrogate(c) &&
160 Utf16::IsLeadSurrogate(previous)) { 163 Utf16::IsLeadSurrogate(previous)) {
161 return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates; 164 return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
162 } 165 }
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after
209 // Copy the rest the slow way. 212 // Copy the rest the slow way.
210 WriteUtf16Slow(unbuffered_start_, 213 WriteUtf16Slow(unbuffered_start_,
211 data + buffer_length, 214 data + buffer_length,
212 length - buffer_length); 215 length - buffer_length);
213 return length; 216 return length;
214 } 217 }
215 218
216 } // namespace unibrow 219 } // namespace unibrow
217 220
218 #endif // V8_UNICODE_INL_H_ 221 #endif // V8_UNICODE_INL_H_
OLD | NEW
« src/api.cc ('K') | « src/unicode.h ('k') | test/cctest/test-api.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698