Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(74)

Side by Side Diff: third_party/protobuf/java/core/src/main/java/com/google/protobuf/Utf8.java

Issue 2600753002: Reverts third_party/protobuf: Update to HEAD (f52e188fe4) (Closed)
Patch Set: Created 3 years, 12 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Protocol Buffers - Google's data interchange format 1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved. 2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/ 3 // https://developers.google.com/protocol-buffers/
4 // 4 //
5 // Redistribution and use in source and binary forms, with or without 5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are 6 // modification, are permitted provided that the following conditions are
7 // met: 7 // met:
8 // 8 //
9 // * Redistributions of source code must retain the above copyright 9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer. 10 // notice, this list of conditions and the following disclaimer.
(...skipping 12 matching lines...) Expand all
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 30
31 package com.google.protobuf; 31 package com.google.protobuf;
32 32
33 import static com.google.protobuf.UnsafeUtil.addressOffset;
34 import static com.google.protobuf.UnsafeUtil.getArrayBaseOffset;
35 import static com.google.protobuf.UnsafeUtil.hasUnsafeArrayOperations;
36 import static com.google.protobuf.UnsafeUtil.hasUnsafeByteBufferOperations;
37 import static java.lang.Character.MAX_SURROGATE; 33 import static java.lang.Character.MAX_SURROGATE;
38 import static java.lang.Character.MIN_SURROGATE; 34 import static java.lang.Character.MIN_SURROGATE;
39 import static java.lang.Character.isSurrogatePair; 35 import static java.lang.Character.isSurrogatePair;
40 import static java.lang.Character.toCodePoint; 36 import static java.lang.Character.toCodePoint;
41 37
38 import java.lang.reflect.Field;
39 import java.nio.Buffer;
42 import java.nio.ByteBuffer; 40 import java.nio.ByteBuffer;
41 import java.security.AccessController;
42 import java.security.PrivilegedExceptionAction;
43 import java.util.logging.Level;
44 import java.util.logging.Logger;
43 45
44 /** 46 /**
45 * A set of low-level, high-performance static utility methods related 47 * A set of low-level, high-performance static utility methods related
46 * to the UTF-8 character encoding. This class has no dependencies 48 * to the UTF-8 character encoding. This class has no dependencies
47 * outside of the core JDK libraries. 49 * outside of the core JDK libraries.
48 * 50 *
49 * <p>There are several variants of UTF-8. The one implemented by 51 * <p>There are several variants of UTF-8. The one implemented by
50 * this class is the restricted definition of UTF-8 introduced in 52 * this class is the restricted definition of UTF-8 introduced in
51 * Unicode 3.1, which mandates the rejection of "overlong" byte 53 * Unicode 3.1, which mandates the rejection of "overlong" byte
52 * sequences as well as rejection of 3-byte surrogate codepoint byte 54 * sequences as well as rejection of 3-byte surrogate codepoint byte
(...skipping 17 matching lines...) Expand all
70 * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is 72 * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is
71 * well-formed in the absence of additional input, or if the byte sequence 73 * well-formed in the absence of additional input, or if the byte sequence
72 * apparently terminated in the middle of a character, an opaque integer 74 * apparently terminated in the middle of a character, an opaque integer
73 * "state" value containing enough information to decode the character when 75 * "state" value containing enough information to decode the character when
74 * passed to a subsequent invocation of a partial decoding method. 76 * passed to a subsequent invocation of a partial decoding method.
75 * 77 *
76 * @author martinrb@google.com (Martin Buchholz) 78 * @author martinrb@google.com (Martin Buchholz)
77 */ 79 */
78 // TODO(nathanmittler): Copy changes in this class back to Guava 80 // TODO(nathanmittler): Copy changes in this class back to Guava
79 final class Utf8 { 81 final class Utf8 {
82 private static final Logger logger = Logger.getLogger(Utf8.class.getName());
80 83
81 /** 84 /**
82 * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations 85 * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations
83 * depending on what is available on the platform. The processor is the platform-optimized 86 * depending on what is available on the platform. The processor is the platform-optimized
84 * delegate for which all methods are delegated directly to. 87 * delegate for which all methods are delegated directly to.
85 */ 88 */
86 private static final Processor processor = 89 private static final Processor processor =
87 UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor( ); 90 UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor( );
88 91
89 /** 92 /**
(...skipping 137 matching lines...) Expand 10 before | Expand all | Expand 10 after
227 default: 230 default:
228 throw new AssertionError(); 231 throw new AssertionError();
229 } 232 }
230 } 233 }
231 234
232 // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw 235 // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw
233 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can 236 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can
234 // fallback to more lenient behavior. 237 // fallback to more lenient behavior.
235 238
236 static class UnpairedSurrogateException extends IllegalArgumentException { 239 static class UnpairedSurrogateException extends IllegalArgumentException {
237 UnpairedSurrogateException(int index, int length) { 240 private UnpairedSurrogateException(int index, int length) {
238 super("Unpaired surrogate at index " + index + " of " + length); 241 super("Unpaired surrogate at index " + index + " of " + length);
239 } 242 }
240 } 243 }
241 244
242 /** 245 /**
243 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, 246 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
244 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in 247 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
245 * both time and space. 248 * both time and space.
246 * 249 *
247 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired 250 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
(...skipping 733 matching lines...) Expand 10 before | Expand all | Expand 10 after
981 } 984 }
982 } 985 }
983 } 986 }
984 } 987 }
985 } 988 }
986 989
987 /** 990 /**
988 * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance. 991 * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance.
989 */ 992 */
990 static final class UnsafeProcessor extends Processor { 993 static final class UnsafeProcessor extends Processor {
994 private static final sun.misc.Unsafe UNSAFE = getUnsafe();
995 private static final long BUFFER_ADDRESS_OFFSET =
996 fieldOffset(field(Buffer.class, "address"));
997 private static final int ARRAY_BASE_OFFSET = byteArrayBaseOffset();
998
999 /**
1000 * We only use Unsafe operations if we have access to direct {@link ByteBuffer}'s address
1001 * and the array base offset is a multiple of 8 (needed by Unsafe.getLong()).
1002 */
1003 private static final boolean AVAILABLE =
1004 BUFFER_ADDRESS_OFFSET != -1 && ARRAY_BASE_OFFSET % 8 == 0;
1005
991 /** 1006 /**
992 * Indicates whether or not all required unsafe operations are supported on this platform. 1007 * Indicates whether or not all required unsafe operations are supported on this platform.
993 */ 1008 */
994 static boolean isAvailable() { 1009 static boolean isAvailable() {
995 return hasUnsafeArrayOperations() && hasUnsafeByteBufferOperations(); 1010 return AVAILABLE;
996 } 1011 }
997 1012
998 @Override 1013 @Override
999 int partialIsValidUtf8(int state, byte[] bytes, final int index, final int l imit) { 1014 int partialIsValidUtf8(int state, byte[] bytes, final int index, final int l imit) {
1000 if ((index | limit | bytes.length - limit) < 0) { 1015 if ((index | limit | bytes.length - limit) < 0) {
1001 throw new ArrayIndexOutOfBoundsException( 1016 throw new ArrayIndexOutOfBoundsException(
1002 String.format("Array length=%d, index=%d, limit=%d", bytes.length, i ndex, limit)); 1017 String.format("Array length=%d, index=%d, limit=%d", bytes.length, i ndex, limit));
1003 } 1018 }
1004 long offset = getArrayBaseOffset() + index; 1019 long offset = ARRAY_BASE_OFFSET + index;
1005 final long offsetLimit = getArrayBaseOffset() + limit; 1020 final long offsetLimit = ARRAY_BASE_OFFSET + limit;
1006 if (state != COMPLETE) { 1021 if (state != COMPLETE) {
1007 // The previous decoding operation was incomplete (or malformed). 1022 // The previous decoding operation was incomplete (or malformed).
1008 // We look for a well-formed sequence consisting of bytes from 1023 // We look for a well-formed sequence consisting of bytes from
1009 // the previous decoding operation (stored in state) together 1024 // the previous decoding operation (stored in state) together
1010 // with bytes from the array slice. 1025 // with bytes from the array slice.
1011 // 1026 //
1012 // We expect such "straddler characters" to be rare. 1027 // We expect such "straddler characters" to be rare.
1013 1028
1014 if (offset >= offsetLimit) { // No bytes? No progress. 1029 if (offset >= offsetLimit) { // No bytes? No progress.
1015 return state; 1030 return state;
1016 } 1031 }
1017 int byte1 = (byte) state; 1032 int byte1 = (byte) state;
1018 // byte1 is never ASCII. 1033 // byte1 is never ASCII.
1019 if (byte1 < (byte) 0xE0) { 1034 if (byte1 < (byte) 0xE0) {
1020 // two-byte form 1035 // two-byte form
1021 1036
1022 // Simultaneously checks for illegal trailing-byte in 1037 // Simultaneously checks for illegal trailing-byte in
1023 // leading position and overlong 2-byte form. 1038 // leading position and overlong 2-byte form.
1024 if (byte1 < (byte) 0xC2 1039 if (byte1 < (byte) 0xC2
1025 // byte2 trailing-byte test 1040 // byte2 trailing-byte test
1026 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1041 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {
1027 return MALFORMED; 1042 return MALFORMED;
1028 } 1043 }
1029 } else if (byte1 < (byte) 0xF0) { 1044 } else if (byte1 < (byte) 0xF0) {
1030 // three-byte form 1045 // three-byte form
1031 1046
1032 // Get byte2 from saved state or array 1047 // Get byte2 from saved state or array
1033 int byte2 = (byte) ~(state >> 8); 1048 int byte2 = (byte) ~(state >> 8);
1034 if (byte2 == 0) { 1049 if (byte2 == 0) {
1035 byte2 = UnsafeUtil.getByte(bytes, offset++); 1050 byte2 = UNSAFE.getByte(bytes, offset++);
1036 if (offset >= offsetLimit) { 1051 if (offset >= offsetLimit) {
1037 return incompleteStateFor(byte1, byte2); 1052 return incompleteStateFor(byte1, byte2);
1038 } 1053 }
1039 } 1054 }
1040 if (byte2 > (byte) 0xBF 1055 if (byte2 > (byte) 0xBF
1041 // overlong? 5 most significant bits must not all be zero 1056 // overlong? 5 most significant bits must not all be zero
1042 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1057 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1043 // illegal surrogate codepoint? 1058 // illegal surrogate codepoint?
1044 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1059 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1045 // byte3 trailing-byte test 1060 // byte3 trailing-byte test
1046 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1061 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {
1047 return MALFORMED; 1062 return MALFORMED;
1048 } 1063 }
1049 } else { 1064 } else {
1050 // four-byte form 1065 // four-byte form
1051 1066
1052 // Get byte2 and byte3 from saved state or array 1067 // Get byte2 and byte3 from saved state or array
1053 int byte2 = (byte) ~(state >> 8); 1068 int byte2 = (byte) ~(state >> 8);
1054 int byte3 = 0; 1069 int byte3 = 0;
1055 if (byte2 == 0) { 1070 if (byte2 == 0) {
1056 byte2 = UnsafeUtil.getByte(bytes, offset++); 1071 byte2 = UNSAFE.getByte(bytes, offset++);
1057 if (offset >= offsetLimit) { 1072 if (offset >= offsetLimit) {
1058 return incompleteStateFor(byte1, byte2); 1073 return incompleteStateFor(byte1, byte2);
1059 } 1074 }
1060 } else { 1075 } else {
1061 byte3 = (byte) (state >> 16); 1076 byte3 = (byte) (state >> 16);
1062 } 1077 }
1063 if (byte3 == 0) { 1078 if (byte3 == 0) {
1064 byte3 = UnsafeUtil.getByte(bytes, offset++); 1079 byte3 = UNSAFE.getByte(bytes, offset++);
1065 if (offset >= offsetLimit) { 1080 if (offset >= offsetLimit) {
1066 return incompleteStateFor(byte1, byte2, byte3); 1081 return incompleteStateFor(byte1, byte2, byte3);
1067 } 1082 }
1068 } 1083 }
1069 1084
1070 // If we were called with state == MALFORMED, then byte1 is 0xFF, 1085 // If we were called with state == MALFORMED, then byte1 is 0xFF,
1071 // which never occurs in well-formed UTF-8, and so we will return 1086 // which never occurs in well-formed UTF-8, and so we will return
1072 // MALFORMED again below. 1087 // MALFORMED again below.
1073 1088
1074 if (byte2 > (byte) 0xBF 1089 if (byte2 > (byte) 0xBF
1075 // Check that 1 <= plane <= 16. Tricky optimized form of: 1090 // Check that 1 <= plane <= 16. Tricky optimized form of:
1076 // if (byte1 > (byte) 0xF4 || 1091 // if (byte1 > (byte) 0xF4 ||
1077 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1092 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1078 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1093 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1079 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1094 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1080 // byte3 trailing-byte test 1095 // byte3 trailing-byte test
1081 || byte3 > (byte) 0xBF 1096 || byte3 > (byte) 0xBF
1082 // byte4 trailing-byte test 1097 // byte4 trailing-byte test
1083 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1098 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {
1084 return MALFORMED; 1099 return MALFORMED;
1085 } 1100 }
1086 } 1101 }
1087 } 1102 }
1088 1103
1089 return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset)); 1104 return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset));
1090 } 1105 }
1091 1106
1092 @Override 1107 @Override
1093 int partialIsValidUtf8Direct( 1108 int partialIsValidUtf8Direct(
(...skipping 18 matching lines...) Expand all
1112 1127
1113 final int byte1 = (byte) state; 1128 final int byte1 = (byte) state;
1114 // byte1 is never ASCII. 1129 // byte1 is never ASCII.
1115 if (byte1 < (byte) 0xE0) { 1130 if (byte1 < (byte) 0xE0) {
1116 // two-byte form 1131 // two-byte form
1117 1132
1118 // Simultaneously checks for illegal trailing-byte in 1133 // Simultaneously checks for illegal trailing-byte in
1119 // leading position and overlong 2-byte form. 1134 // leading position and overlong 2-byte form.
1120 if (byte1 < (byte) 0xC2 1135 if (byte1 < (byte) 0xC2
1121 // byte2 trailing-byte test 1136 // byte2 trailing-byte test
1122 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1137 || UNSAFE.getByte(address++) > (byte) 0xBF) {
1123 return MALFORMED; 1138 return MALFORMED;
1124 } 1139 }
1125 } else if (byte1 < (byte) 0xF0) { 1140 } else if (byte1 < (byte) 0xF0) {
1126 // three-byte form 1141 // three-byte form
1127 1142
1128 // Get byte2 from saved state or array 1143 // Get byte2 from saved state or array
1129 int byte2 = (byte) ~(state >> 8); 1144 int byte2 = (byte) ~(state >> 8);
1130 if (byte2 == 0) { 1145 if (byte2 == 0) {
1131 byte2 = UnsafeUtil.getByte(address++); 1146 byte2 = UNSAFE.getByte(address++);
1132 if (address >= addressLimit) { 1147 if (address >= addressLimit) {
1133 return incompleteStateFor(byte1, byte2); 1148 return incompleteStateFor(byte1, byte2);
1134 } 1149 }
1135 } 1150 }
1136 if (byte2 > (byte) 0xBF 1151 if (byte2 > (byte) 0xBF
1137 // overlong? 5 most significant bits must not all be zero 1152 // overlong? 5 most significant bits must not all be zero
1138 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1153 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1139 // illegal surrogate codepoint? 1154 // illegal surrogate codepoint?
1140 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1155 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1141 // byte3 trailing-byte test 1156 // byte3 trailing-byte test
1142 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1157 || UNSAFE.getByte(address++) > (byte) 0xBF) {
1143 return MALFORMED; 1158 return MALFORMED;
1144 } 1159 }
1145 } else { 1160 } else {
1146 // four-byte form 1161 // four-byte form
1147 1162
1148 // Get byte2 and byte3 from saved state or array 1163 // Get byte2 and byte3 from saved state or array
1149 int byte2 = (byte) ~(state >> 8); 1164 int byte2 = (byte) ~(state >> 8);
1150 int byte3 = 0; 1165 int byte3 = 0;
1151 if (byte2 == 0) { 1166 if (byte2 == 0) {
1152 byte2 = UnsafeUtil.getByte(address++); 1167 byte2 = UNSAFE.getByte(address++);
1153 if (address >= addressLimit) { 1168 if (address >= addressLimit) {
1154 return incompleteStateFor(byte1, byte2); 1169 return incompleteStateFor(byte1, byte2);
1155 } 1170 }
1156 } else { 1171 } else {
1157 byte3 = (byte) (state >> 16); 1172 byte3 = (byte) (state >> 16);
1158 } 1173 }
1159 if (byte3 == 0) { 1174 if (byte3 == 0) {
1160 byte3 = UnsafeUtil.getByte(address++); 1175 byte3 = UNSAFE.getByte(address++);
1161 if (address >= addressLimit) { 1176 if (address >= addressLimit) {
1162 return incompleteStateFor(byte1, byte2, byte3); 1177 return incompleteStateFor(byte1, byte2, byte3);
1163 } 1178 }
1164 } 1179 }
1165 1180
1166 // If we were called with state == MALFORMED, then byte1 is 0xFF, 1181 // If we were called with state == MALFORMED, then byte1 is 0xFF,
1167 // which never occurs in well-formed UTF-8, and so we will return 1182 // which never occurs in well-formed UTF-8, and so we will return
1168 // MALFORMED again below. 1183 // MALFORMED again below.
1169 1184
1170 if (byte2 > (byte) 0xBF 1185 if (byte2 > (byte) 0xBF
1171 // Check that 1 <= plane <= 16. Tricky optimized form of: 1186 // Check that 1 <= plane <= 16. Tricky optimized form of:
1172 // if (byte1 > (byte) 0xF4 || 1187 // if (byte1 > (byte) 0xF4 ||
1173 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1188 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1174 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1189 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1175 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1190 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1176 // byte3 trailing-byte test 1191 // byte3 trailing-byte test
1177 || byte3 > (byte) 0xBF 1192 || byte3 > (byte) 0xBF
1178 // byte4 trailing-byte test 1193 // byte4 trailing-byte test
1179 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1194 || UNSAFE.getByte(address++) > (byte) 0xBF) {
1180 return MALFORMED; 1195 return MALFORMED;
1181 } 1196 }
1182 } 1197 }
1183 } 1198 }
1184 1199
1185 return partialIsValidUtf8(address, (int) (addressLimit - address)); 1200 return partialIsValidUtf8(address, (int) (addressLimit - address));
1186 } 1201 }
1187 1202
1188 @Override 1203 @Override
1189 int encodeUtf8(final CharSequence in, final byte[] out, final int offset, fi nal int length) { 1204 int encodeUtf8(final CharSequence in, final byte[] out, final int offset, fi nal int length) {
1190 long outIx = getArrayBaseOffset() + offset; 1205 long outIx = ARRAY_BASE_OFFSET + offset;
1191 final long outLimit = outIx + length; 1206 final long outLimit = outIx + length;
1192 final int inLimit = in.length(); 1207 final int inLimit = in.length();
1193 if (inLimit > length || out.length - length < offset) { 1208 if (inLimit > length || out.length - length < offset) {
1194 // Not even enough room for an ASCII-encoded string. 1209 // Not even enough room for an ASCII-encoded string.
1195 throw new ArrayIndexOutOfBoundsException( 1210 throw new ArrayIndexOutOfBoundsException(
1196 "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset + length)); 1211 "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset + length));
1197 } 1212 }
1198 1213
1199 // Designed to take advantage of 1214 // Designed to take advantage of
1200 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination 1215 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination
1201 int inIx = 0; 1216 int inIx = 0;
1202 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { 1217 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {
1203 UnsafeUtil.putByte(out, outIx++, (byte) c); 1218 UNSAFE.putByte(out, outIx++, (byte) c);
1204 } 1219 }
1205 if (inIx == inLimit) { 1220 if (inIx == inLimit) {
1206 // We're done, it was ASCII encoded. 1221 // We're done, it was ASCII encoded.
1207 return (int) (outIx - getArrayBaseOffset()); 1222 return (int) (outIx - ARRAY_BASE_OFFSET);
1208 } 1223 }
1209 1224
1210 for (char c; inIx < inLimit; ++inIx) { 1225 for (char c; inIx < inLimit; ++inIx) {
1211 c = in.charAt(inIx); 1226 c = in.charAt(inIx);
1212 if (c < 0x80 && outIx < outLimit) { 1227 if (c < 0x80 && outIx < outLimit) {
1213 UnsafeUtil.putByte(out, outIx++, (byte) c); 1228 UNSAFE.putByte(out, outIx++, (byte) c);
1214 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes 1229 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes
1215 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 6) | (c >>> 6))); 1230 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 6) | (c >>> 6)));
1216 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); 1231 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & c)));
1217 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) { 1232 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) {
1218 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 byte s 1233 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 byte s
1219 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 5) | (c >>> 12))); 1234 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 5) | (c >>> 12)));
1220 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); 1235 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
1221 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); 1236 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & c)));
1222 } else if (outIx <= outLimit - 4L) { 1237 } else if (outIx <= outLimit - 4L) {
1223 // Minimum code point represented by a surrogate pair is 0x10000, 17 b its, four UTF-8 1238 // Minimum code point represented by a surrogate pair is 0x10000, 17 b its, four UTF-8
1224 // bytes 1239 // bytes
1225 final char low; 1240 final char low;
1226 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx )))) { 1241 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx )))) {
1227 throw new UnpairedSurrogateException((inIx - 1), inLimit); 1242 throw new UnpairedSurrogateException((inIx - 1), inLimit);
1228 } 1243 }
1229 int codePoint = toCodePoint(c, low); 1244 int codePoint = toCodePoint(c, low);
1230 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 4) | (codePoint >>> 1 8))); 1245 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))) ;
1231 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); 1246 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12) )));
1232 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); 1247 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)) ));
1233 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & codePoint))); 1248 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & codePoint)));
1234 } else { 1249 } else {
1235 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) 1250 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)
1236 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1) ))) { 1251 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1) ))) {
1237 // We are surrogates and we're not a surrogate pair. 1252 // We are surrogates and we're not a surrogate pair.
1238 throw new UnpairedSurrogateException(inIx, inLimit); 1253 throw new UnpairedSurrogateException(inIx, inLimit);
1239 } 1254 }
1240 // Not enough space in the output buffer. 1255 // Not enough space in the output buffer.
1241 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx); 1256 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);
1242 } 1257 }
1243 } 1258 }
1244 1259
1245 // All bytes have been encoded. 1260 // All bytes have been encoded.
1246 return (int) (outIx - getArrayBaseOffset()); 1261 return (int) (outIx - ARRAY_BASE_OFFSET);
1247 } 1262 }
1248 1263
1249 @Override 1264 @Override
1250 void encodeUtf8Direct(CharSequence in, ByteBuffer out) { 1265 void encodeUtf8Direct(CharSequence in, ByteBuffer out) {
1251 final long address = addressOffset(out); 1266 final long address = addressOffset(out);
1252 long outIx = address + out.position(); 1267 long outIx = address + out.position();
1253 final long outLimit = address + out.limit(); 1268 final long outLimit = address + out.limit();
1254 final int inLimit = in.length(); 1269 final int inLimit = in.length();
1255 if (inLimit > outLimit - outIx) { 1270 if (inLimit > outLimit - outIx) {
1256 // Not even enough room for an ASCII-encoded string. 1271 // Not even enough room for an ASCII-encoded string.
1257 throw new ArrayIndexOutOfBoundsException( 1272 throw new ArrayIndexOutOfBoundsException(
1258 "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limi t()); 1273 "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limi t());
1259 } 1274 }
1260 1275
1261 // Designed to take advantage of 1276 // Designed to take advantage of
1262 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination 1277 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination
1263 int inIx = 0; 1278 int inIx = 0;
1264 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { 1279 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {
1265 UnsafeUtil.putByte(outIx++, (byte) c); 1280 UNSAFE.putByte(outIx++, (byte) c);
1266 } 1281 }
1267 if (inIx == inLimit) { 1282 if (inIx == inLimit) {
1268 // We're done, it was ASCII encoded. 1283 // We're done, it was ASCII encoded.
1269 out.position((int) (outIx - address)); 1284 out.position((int) (outIx - address));
1270 return; 1285 return;
1271 } 1286 }
1272 1287
1273 for (char c; inIx < inLimit; ++inIx) { 1288 for (char c; inIx < inLimit; ++inIx) {
1274 c = in.charAt(inIx); 1289 c = in.charAt(inIx);
1275 if (c < 0x80 && outIx < outLimit) { 1290 if (c < 0x80 && outIx < outLimit) {
1276 UnsafeUtil.putByte(outIx++, (byte) c); 1291 UNSAFE.putByte(outIx++, (byte) c);
1277 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes 1292 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes
1278 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6))); 1293 UNSAFE.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6)));
1279 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c))); 1294 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & c)));
1280 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) { 1295 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) {
1281 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 byte s 1296 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 byte s
1282 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12))); 1297 UNSAFE.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12)));
1283 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); 1298 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
1284 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c))); 1299 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & c)));
1285 } else if (outIx <= outLimit - 4L) { 1300 } else if (outIx <= outLimit - 4L) {
1286 // Minimum code point represented by a surrogate pair is 0x10000, 17 b its, four UTF-8 1301 // Minimum code point represented by a surrogate pair is 0x10000, 17 b its, four UTF-8
1287 // bytes 1302 // bytes
1288 final char low; 1303 final char low;
1289 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx )))) { 1304 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx )))) {
1290 throw new UnpairedSurrogateException((inIx - 1), inLimit); 1305 throw new UnpairedSurrogateException((inIx - 1), inLimit);
1291 } 1306 }
1292 int codePoint = toCodePoint(c, low); 1307 int codePoint = toCodePoint(c, low);
1293 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); 1308 UNSAFE.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)));
1294 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)) )); 1309 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
1295 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))) ); 1310 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
1296 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint))); 1311 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint)));
1297 } else { 1312 } else {
1298 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) 1313 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)
1299 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1) ))) { 1314 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1) ))) {
1300 // We are surrogates and we're not a surrogate pair. 1315 // We are surrogates and we're not a surrogate pair.
1301 throw new UnpairedSurrogateException(inIx, inLimit); 1316 throw new UnpairedSurrogateException(inIx, inLimit);
1302 } 1317 }
1303 // Not enough space in the output buffer. 1318 // Not enough space in the output buffer.
1304 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx); 1319 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);
1305 } 1320 }
1306 } 1321 }
(...skipping 20 matching lines...) Expand all
1327 return 0; 1342 return 0;
1328 } 1343 }
1329 1344
1330 // Read bytes until 8-byte aligned so that we can read longs in the loop below. 1345 // Read bytes until 8-byte aligned so that we can read longs in the loop below.
1331 // Byte arrays are already either 8 or 16-byte aligned, so we just need to make sure that 1346 // Byte arrays are already either 8 or 16-byte aligned, so we just need to make sure that
1332 // the index (relative to the start of the array) is also 8-byte aligned. We do this by 1347 // the index (relative to the start of the array) is also 8-byte aligned. We do this by
1333 // ANDing the index with 7 to determine the number of bytes that need to be read before 1348 // ANDing the index with 7 to determine the number of bytes that need to be read before
1334 // we're 8-byte aligned. 1349 // we're 8-byte aligned.
1335 final int unaligned = (int) offset & 7; 1350 final int unaligned = (int) offset & 7;
1336 for (int j = unaligned; j > 0; j--) { 1351 for (int j = unaligned; j > 0; j--) {
1337 if (UnsafeUtil.getByte(bytes, offset++) < 0) { 1352 if (UNSAFE.getByte(bytes, offset++) < 0) {
1338 return unaligned - j; 1353 return unaligned - j;
1339 } 1354 }
1340 } 1355 }
1341 1356
1342 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 1357 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
1343 // To speed things up further, we're reading longs instead of bytes so we use a mask to 1358 // To speed things up further, we're reading longs instead of bytes so we use a mask to
1344 // determine if any byte in the current long is non-ASCII. 1359 // determine if any byte in the current long is non-ASCII.
1345 remaining -= unaligned; 1360 remaining -= unaligned;
1346 for (; remaining >= 8 && (UnsafeUtil.getLong(bytes, offset) & ASCII_MASK_L ONG) == 0; 1361 for (; remaining >= 8 && (UNSAFE.getLong(bytes, offset) & ASCII_MASK_LONG) == 0;
1347 offset += 8, remaining -= 8) {} 1362 offset += 8, remaining -= 8) {}
1348 return maxChars - remaining; 1363 return maxChars - remaining;
1349 } 1364 }
1350 1365
1351 /** 1366 /**
1352 * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} except that it uses the 1367 * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} except that it uses the
1353 * most efficient method available to the platform. 1368 * most efficient method available to the platform.
1354 */ 1369 */
1355 private static int unsafeEstimateConsecutiveAscii(long address, final int ma xChars) { 1370 private static int unsafeEstimateConsecutiveAscii(long address, final int ma xChars) {
1356 int remaining = maxChars; 1371 int remaining = maxChars;
1357 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) { 1372 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) {
1358 // Don't bother with small strings. 1373 // Don't bother with small strings.
1359 return 0; 1374 return 0;
1360 } 1375 }
1361 1376
1362 // Read bytes until 8-byte aligned so that we can read longs in the loop below. 1377 // Read bytes until 8-byte aligned so that we can read longs in the loop below.
1363 // We do this by ANDing the address with 7 to determine the number of bytes that need to 1378 // We do this by ANDing the address with 7 to determine the number of bytes that need to
1364 // be read before we're 8-byte aligned. 1379 // be read before we're 8-byte aligned.
1365 final int unaligned = (int) address & 7; 1380 final int unaligned = (int) address & 7;
1366 for (int j = unaligned; j > 0; j--) { 1381 for (int j = unaligned; j > 0; j--) {
1367 if (UnsafeUtil.getByte(address++) < 0) { 1382 if (UNSAFE.getByte(address++) < 0) {
1368 return unaligned - j; 1383 return unaligned - j;
1369 } 1384 }
1370 } 1385 }
1371 1386
1372 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 1387 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
1373 // To speed things up further, we're reading longs instead of bytes so we use a mask to 1388 // To speed things up further, we're reading longs instead of bytes so we use a mask to
1374 // determine if any byte in the current long is non-ASCII. 1389 // determine if any byte in the current long is non-ASCII.
1375 remaining -= unaligned; 1390 remaining -= unaligned;
1376 for (; remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) = = 0; 1391 for (; remaining >= 8 && (UNSAFE.getLong(address) & ASCII_MASK_LONG) == 0;
1377 address += 8, remaining -= 8) {} 1392 address += 8, remaining -= 8) {}
1378 return maxChars - remaining; 1393 return maxChars - remaining;
1379 } 1394 }
1380 1395
1381 private static int partialIsValidUtf8(final byte[] bytes, long offset, int r emaining) { 1396 private static int partialIsValidUtf8(final byte[] bytes, long offset, int r emaining) {
1382 // Skip past ASCII characters as quickly as possible. 1397 // Skip past ASCII characters as quickly as possible.
1383 final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remainin g); 1398 final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remainin g);
1384 remaining -= skipped; 1399 remaining -= skipped;
1385 offset += skipped; 1400 offset += skipped;
1386 1401
1387 for (;;) { 1402 for (;;) {
1388 // Optimize for interior runs of ASCII bytes. 1403 // Optimize for interior runs of ASCII bytes.
1389 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold? 1404 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
1390 // Maybe after seeing a few in a row that are ASCII, go back to fast mode? 1405 // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
1391 int byte1 = 0; 1406 int byte1 = 0;
1392 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >= 0; --remaining) { 1407 for (; remaining > 0 && (byte1 = UNSAFE.getByte(bytes, offset++)) >= 0; --remaining) {
1393 } 1408 }
1394 if (remaining == 0) { 1409 if (remaining == 0) {
1395 return COMPLETE; 1410 return COMPLETE;
1396 } 1411 }
1397 remaining--; 1412 remaining--;
1398 1413
1399 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms. 1414 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms.
1400 if (byte1 < (byte) 0xE0) { 1415 if (byte1 < (byte) 0xE0) {
1401 // Two-byte form (110xxxxx 10xxxxxx) 1416 // Two-byte form (110xxxxx 10xxxxxx)
1402 if (remaining == 0) { 1417 if (remaining == 0) {
1403 // Incomplete sequence 1418 // Incomplete sequence
1404 return byte1; 1419 return byte1;
1405 } 1420 }
1406 remaining--; 1421 remaining--;
1407 1422
1408 // Simultaneously checks for illegal trailing-byte in 1423 // Simultaneously checks for illegal trailing-byte in
1409 // leading position and overlong 2-byte form. 1424 // leading position and overlong 2-byte form.
1410 if (byte1 < (byte) 0xC2 1425 if (byte1 < (byte) 0xC2
1411 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1426 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {
1412 return MALFORMED; 1427 return MALFORMED;
1413 } 1428 }
1414 } else if (byte1 < (byte) 0xF0) { 1429 } else if (byte1 < (byte) 0xF0) {
1415 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx) 1430 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
1416 if (remaining < 2) { 1431 if (remaining < 2) {
1417 // Incomplete sequence 1432 // Incomplete sequence
1418 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); 1433 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);
1419 } 1434 }
1420 remaining -= 2; 1435 remaining -= 2;
1421 1436
1422 final int byte2; 1437 final int byte2;
1423 if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF 1438 if ((byte2 = UNSAFE.getByte(bytes, offset++)) > (byte) 0xBF
1424 // overlong? 5 most significant bits must not all be zero 1439 // overlong? 5 most significant bits must not all be zero
1425 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1440 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1426 // check for illegal surrogate codepoints 1441 // check for illegal surrogate codepoints
1427 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1442 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1428 // byte3 trailing-byte test 1443 // byte3 trailing-byte test
1429 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1444 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {
1430 return MALFORMED; 1445 return MALFORMED;
1431 } 1446 }
1432 } else { 1447 } else {
1433 // Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) 1448 // Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
1434 if (remaining < 3) { 1449 if (remaining < 3) {
1435 // Incomplete sequence 1450 // Incomplete sequence
1436 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); 1451 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);
1437 } 1452 }
1438 remaining -= 3; 1453 remaining -= 3;
1439 1454
1440 final int byte2; 1455 final int byte2;
1441 if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF 1456 if ((byte2 = UNSAFE.getByte(bytes, offset++)) > (byte) 0xBF
1442 // Check that 1 <= plane <= 16. Tricky optimized form of: 1457 // Check that 1 <= plane <= 16. Tricky optimized form of:
1443 // if (byte1 > (byte) 0xF4 || 1458 // if (byte1 > (byte) 0xF4 ||
1444 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1459 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1445 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1460 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1446 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1461 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1447 // byte3 trailing-byte test 1462 // byte3 trailing-byte test
1448 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF 1463 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF
1449 // byte4 trailing-byte test 1464 // byte4 trailing-byte test
1450 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1465 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {
1451 return MALFORMED; 1466 return MALFORMED;
1452 } 1467 }
1453 } 1468 }
1454 } 1469 }
1455 } 1470 }
1456 1471
1457 private static int partialIsValidUtf8(long address, int remaining) { 1472 private static int partialIsValidUtf8(long address, int remaining) {
1458 // Skip past ASCII characters as quickly as possible. 1473 // Skip past ASCII characters as quickly as possible.
1459 final int skipped = unsafeEstimateConsecutiveAscii(address, remaining); 1474 final int skipped = unsafeEstimateConsecutiveAscii(address, remaining);
1460 address += skipped; 1475 address += skipped;
1461 remaining -= skipped; 1476 remaining -= skipped;
1462 1477
1463 for (;;) { 1478 for (;;) {
1464 // Optimize for interior runs of ASCII bytes. 1479 // Optimize for interior runs of ASCII bytes.
1465 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold? 1480 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
1466 // Maybe after seeing a few in a row that are ASCII, go back to fast mode? 1481 // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
1467 int byte1 = 0; 1482 int byte1 = 0;
1468 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; -- remaining) { 1483 for (; remaining > 0 && (byte1 = UNSAFE.getByte(address++)) >= 0; --rema ining) {
1469 } 1484 }
1470 if (remaining == 0) { 1485 if (remaining == 0) {
1471 return COMPLETE; 1486 return COMPLETE;
1472 } 1487 }
1473 remaining--; 1488 remaining--;
1474 1489
1475 if (byte1 < (byte) 0xE0) { 1490 if (byte1 < (byte) 0xE0) {
1476 // Two-byte form 1491 // Two-byte form
1477 1492
1478 if (remaining == 0) { 1493 if (remaining == 0) {
1479 // Incomplete sequence 1494 // Incomplete sequence
1480 return byte1; 1495 return byte1;
1481 } 1496 }
1482 remaining--; 1497 remaining--;
1483 1498
1484 // Simultaneously checks for illegal trailing-byte in 1499 // Simultaneously checks for illegal trailing-byte in
1485 // leading position and overlong 2-byte form. 1500 // leading position and overlong 2-byte form.
1486 if (byte1 < (byte) 0xC2 || UnsafeUtil.getByte(address++) > (byte) 0xBF ) { 1501 if (byte1 < (byte) 0xC2 || UNSAFE.getByte(address++) > (byte) 0xBF) {
1487 return MALFORMED; 1502 return MALFORMED;
1488 } 1503 }
1489 } else if (byte1 < (byte) 0xF0) { 1504 } else if (byte1 < (byte) 0xF0) {
1490 // Three-byte form 1505 // Three-byte form
1491 1506
1492 if (remaining < 2) { 1507 if (remaining < 2) {
1493 // Incomplete sequence 1508 // Incomplete sequence
1494 return unsafeIncompleteStateFor(address, byte1, remaining); 1509 return unsafeIncompleteStateFor(address, byte1, remaining);
1495 } 1510 }
1496 remaining -= 2; 1511 remaining -= 2;
1497 1512
1498 final byte byte2 = UnsafeUtil.getByte(address++); 1513 final byte byte2 = UNSAFE.getByte(address++);
1499 if (byte2 > (byte) 0xBF 1514 if (byte2 > (byte) 0xBF
1500 // overlong? 5 most significant bits must not all be zero 1515 // overlong? 5 most significant bits must not all be zero
1501 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1516 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1502 // check for illegal surrogate codepoints 1517 // check for illegal surrogate codepoints
1503 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1518 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1504 // byte3 trailing-byte test 1519 // byte3 trailing-byte test
1505 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1520 || UNSAFE.getByte(address++) > (byte) 0xBF) {
1506 return MALFORMED; 1521 return MALFORMED;
1507 } 1522 }
1508 } else { 1523 } else {
1509 // Four-byte form 1524 // Four-byte form
1510 1525
1511 if (remaining < 3) { 1526 if (remaining < 3) {
1512 // Incomplete sequence 1527 // Incomplete sequence
1513 return unsafeIncompleteStateFor(address, byte1, remaining); 1528 return unsafeIncompleteStateFor(address, byte1, remaining);
1514 } 1529 }
1515 remaining -= 3; 1530 remaining -= 3;
1516 1531
1517 final byte byte2 = UnsafeUtil.getByte(address++); 1532 final byte byte2 = UNSAFE.getByte(address++);
1518 if (byte2 > (byte) 0xBF 1533 if (byte2 > (byte) 0xBF
1519 // Check that 1 <= plane <= 16. Tricky optimized form of: 1534 // Check that 1 <= plane <= 16. Tricky optimized form of:
1520 // if (byte1 > (byte) 0xF4 || 1535 // if (byte1 > (byte) 0xF4 ||
1521 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1536 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1522 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1537 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1523 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1538 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1524 // byte3 trailing-byte test 1539 // byte3 trailing-byte test
1525 || UnsafeUtil.getByte(address++) > (byte) 0xBF 1540 || UNSAFE.getByte(address++) > (byte) 0xBF
1526 // byte4 trailing-byte test 1541 // byte4 trailing-byte test
1527 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1542 || UNSAFE.getByte(address++) > (byte) 0xBF) {
1528 return MALFORMED; 1543 return MALFORMED;
1529 } 1544 }
1530 } 1545 }
1531 } 1546 }
1532 } 1547 }
1533 1548
1534 private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long of fset, 1549 private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long of fset,
1535 int remaining) { 1550 int remaining) {
1536 switch (remaining) { 1551 switch (remaining) {
1537 case 0: { 1552 case 0: {
1538 return incompleteStateFor(byte1); 1553 return incompleteStateFor(byte1);
1539 } 1554 }
1540 case 1: { 1555 case 1: {
1541 return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset)); 1556 return incompleteStateFor(byte1, UNSAFE.getByte(bytes, offset));
1542 } 1557 }
1543 case 2: { 1558 case 2: {
1544 return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset), 1559 return incompleteStateFor(byte1, UNSAFE.getByte(bytes, offset),
1545 UnsafeUtil.getByte(bytes, offset + 1)); 1560 UNSAFE.getByte(bytes, offset + 1));
1546 } 1561 }
1547 default: { 1562 default: {
1548 throw new AssertionError(); 1563 throw new AssertionError();
1549 } 1564 }
1550 } 1565 }
1551 } 1566 }
1552 1567
1553 private static int unsafeIncompleteStateFor(long address, final int byte1, i nt remaining) { 1568 private static int unsafeIncompleteStateFor(long address, final int byte1, i nt remaining) {
1554 switch (remaining) { 1569 switch (remaining) {
1555 case 0: { 1570 case 0: {
1556 return incompleteStateFor(byte1); 1571 return incompleteStateFor(byte1);
1557 } 1572 }
1558 case 1: { 1573 case 1: {
1559 return incompleteStateFor(byte1, UnsafeUtil.getByte(address)); 1574 return incompleteStateFor(byte1, UNSAFE.getByte(address));
1560 } 1575 }
1561 case 2: { 1576 case 2: {
1562 return incompleteStateFor(byte1, UnsafeUtil.getByte(address), 1577 return incompleteStateFor(byte1, UNSAFE.getByte(address), UNSAFE.getBy te(address + 1));
1563 UnsafeUtil.getByte(address + 1));
1564 } 1578 }
1565 default: { 1579 default: {
1566 throw new AssertionError(); 1580 throw new AssertionError();
1567 } 1581 }
1568 } 1582 }
1569 } 1583 }
1584
1585 /**
1586 * Gets the field with the given name within the class, or {@code null} if not found. If
1587 * found, the field is made accessible.
1588 */
1589 private static Field field(Class<?> clazz, String fieldName) {
1590 Field field;
1591 try {
1592 field = clazz.getDeclaredField(fieldName);
1593 field.setAccessible(true);
1594 } catch (Throwable t) {
1595 // Failed to access the fields.
1596 field = null;
1597 }
1598 logger.log(Level.FINEST, "{0}.{1}: {2}",
1599 new Object[] {clazz.getName(), fieldName, (field != null ? "available" : "unavailable")});
1600 return field;
1601 }
1602
1603 /**
1604 * Returns the offset of the provided field, or {@code -1} if {@code sun.misc.Unsafe} is not
1605 * available.
1606 */
1607 private static long fieldOffset(Field field) {
1608 return field == null || UNSAFE == null ? -1 : UNSAFE.objectFieldOffset(fie ld);
1609 }
1610
1611 /**
1612 * Get the base offset for byte arrays, or {@code -1} if {@code sun.misc.Unsafe} is not
1613 * available.
1614 */
1615 private static <T> int byteArrayBaseOffset() {
1616 return UNSAFE == null ? -1 : UNSAFE.arrayBaseOffset(byte[].class);
1617 }
1618
1619 /**
1620 * Gets the offset of the {@code address} field of the given direct {@link ByteBuffer}.
1621 */
1622 private static long addressOffset(ByteBuffer buffer) {
1623 return UNSAFE.getLong(buffer, BUFFER_ADDRESS_OFFSET);
1624 }
1625
1626 /**
1627 * Gets the {@code sun.misc.Unsafe} instance, or {@code null} if not available on this
1628 * platform.
1629 */
1630 private static sun.misc.Unsafe getUnsafe() {
1631 sun.misc.Unsafe unsafe = null;
1632 try {
1633 unsafe = AccessController.doPrivileged(new PrivilegedExceptionAction<sun .misc.Unsafe>() {
1634 @Override
1635 public sun.misc.Unsafe run() throws Exception {
1636 Class<sun.misc.Unsafe> k = sun.misc.Unsafe.class;
1637
1638 // Check that this platform supports all of the required unsafe methods.
1639 checkRequiredMethods(k);
1640
1641 for (Field f : k.getDeclaredFields()) {
1642 f.setAccessible(true);
1643 Object x = f.get(null);
1644 if (k.isInstance(x)) {
1645 return k.cast(x);
1646 }
1647 }
1648 // The sun.misc.Unsafe field does not exist.
1649 return null;
1650 }
1651 });
1652 } catch (Throwable e) {
1653 // Catching Throwable here due to the fact that Google AppEngine raises NoClassDefFoundError
1654 // for Unsafe.
1655 }
1656
1657 logger.log(Level.FINEST, "sun.misc.Unsafe: {}",
1658 unsafe != null ? "available" : "unavailable");
1659 return unsafe;
1660 }
1661
1662 /**
1663 * Verifies that all required methods of {@code sun.misc.Unsafe} are available on this platform.
1664 */
1665 private static void checkRequiredMethods(Class<sun.misc.Unsafe> clazz)
1666 throws NoSuchMethodException, SecurityException {
1667 // Needed for Unsafe byte[] access
1668 clazz.getMethod("arrayBaseOffset", Class.class);
1669 clazz.getMethod("getByte", Object.class, long.class);
1670 clazz.getMethod("putByte", Object.class, long.class, byte.class);
1671 clazz.getMethod("getLong", Object.class, long.class);
1672
1673 // Needed for Unsafe Direct ByteBuffer access
1674 clazz.getMethod("objectFieldOffset", Field.class);
1675 clazz.getMethod("getByte", long.class);
1676 clazz.getMethod("getLong", Object.class, long.class);
1677 clazz.getMethod("putByte", long.class, byte.class);
1678 clazz.getMethod("getLong", long.class);
1679 }
1570 } 1680 }
1571 1681
1572 private Utf8() {} 1682 private Utf8() {}
1573 } 1683 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698