Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1201)

Side by Side Diff: third_party/protobuf/java/core/src/main/java/com/google/protobuf/Utf8.java

Issue 2495533002: third_party/protobuf: Update to HEAD (83d681ee2c) (Closed)
Patch Set: Make chrome settings proto generated file a component Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Protocol Buffers - Google's data interchange format 1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved. 2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/ 3 // https://developers.google.com/protocol-buffers/
4 // 4 //
5 // Redistribution and use in source and binary forms, with or without 5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are 6 // modification, are permitted provided that the following conditions are
7 // met: 7 // met:
8 // 8 //
9 // * Redistributions of source code must retain the above copyright 9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer. 10 // notice, this list of conditions and the following disclaimer.
(...skipping 12 matching lines...) Expand all
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 30
31 package com.google.protobuf; 31 package com.google.protobuf;
32 32
33 import static com.google.protobuf.UnsafeUtil.addressOffset;
34 import static com.google.protobuf.UnsafeUtil.getArrayBaseOffset;
35 import static com.google.protobuf.UnsafeUtil.hasUnsafeArrayOperations;
36 import static com.google.protobuf.UnsafeUtil.hasUnsafeByteBufferOperations;
33 import static java.lang.Character.MAX_SURROGATE; 37 import static java.lang.Character.MAX_SURROGATE;
34 import static java.lang.Character.MIN_SURROGATE; 38 import static java.lang.Character.MIN_SURROGATE;
35 import static java.lang.Character.isSurrogatePair; 39 import static java.lang.Character.isSurrogatePair;
36 import static java.lang.Character.toCodePoint; 40 import static java.lang.Character.toCodePoint;
37 41
38 import java.lang.reflect.Field;
39 import java.nio.Buffer;
40 import java.nio.ByteBuffer; 42 import java.nio.ByteBuffer;
41 import java.security.AccessController;
42 import java.security.PrivilegedExceptionAction;
43 import java.util.logging.Level;
44 import java.util.logging.Logger;
45 43
46 /** 44 /**
47 * A set of low-level, high-performance static utility methods related 45 * A set of low-level, high-performance static utility methods related
48 * to the UTF-8 character encoding. This class has no dependencies 46 * to the UTF-8 character encoding. This class has no dependencies
49 * outside of the core JDK libraries. 47 * outside of the core JDK libraries.
50 * 48 *
51 * <p>There are several variants of UTF-8. The one implemented by 49 * <p>There are several variants of UTF-8. The one implemented by
52 * this class is the restricted definition of UTF-8 introduced in 50 * this class is the restricted definition of UTF-8 introduced in
53 * Unicode 3.1, which mandates the rejection of "overlong" byte 51 * Unicode 3.1, which mandates the rejection of "overlong" byte
54 * sequences as well as rejection of 3-byte surrogate codepoint byte 52 * sequences as well as rejection of 3-byte surrogate codepoint byte
(...skipping 17 matching lines...) Expand all
72 * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is 70 * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is
73 * well-formed in the absence of additional input, or if the byte sequence 71 * well-formed in the absence of additional input, or if the byte sequence
74 * apparently terminated in the middle of a character, an opaque integer 72 * apparently terminated in the middle of a character, an opaque integer
75 * "state" value containing enough information to decode the character when 73 * "state" value containing enough information to decode the character when
76 * passed to a subsequent invocation of a partial decoding method. 74 * passed to a subsequent invocation of a partial decoding method.
77 * 75 *
78 * @author martinrb@google.com (Martin Buchholz) 76 * @author martinrb@google.com (Martin Buchholz)
79 */ 77 */
80 // TODO(nathanmittler): Copy changes in this class back to Guava 78 // TODO(nathanmittler): Copy changes in this class back to Guava
81 final class Utf8 { 79 final class Utf8 {
82 private static final Logger logger = Logger.getLogger(Utf8.class.getName());
83 80
84 /** 81 /**
85 * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations 82 * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations
86 * depending on what is available on the platform. The processor is the platform-optimized 83 * depending on what is available on the platform. The processor is the platform-optimized
87 * delegate for which all methods are delegated directly to. 84 * delegate for which all methods are delegated directly to.
88 */ 85 */
89 private static final Processor processor = 86 private static final Processor processor =
90 UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor( ); 87 UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor( );
91 88
92 /** 89 /**
(...skipping 137 matching lines...) Expand 10 before | Expand all | Expand 10 after
230 default: 227 default:
231 throw new AssertionError(); 228 throw new AssertionError();
232 } 229 }
233 } 230 }
234 231
235 // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw 232 // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw
236 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can 233 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can
237 // fallback to more lenient behavior. 234 // fallback to more lenient behavior.
238 235
239 static class UnpairedSurrogateException extends IllegalArgumentException { 236 static class UnpairedSurrogateException extends IllegalArgumentException {
240 private UnpairedSurrogateException(int index, int length) { 237 UnpairedSurrogateException(int index, int length) {
241 super("Unpaired surrogate at index " + index + " of " + length); 238 super("Unpaired surrogate at index " + index + " of " + length);
242 } 239 }
243 } 240 }
244 241
245 /** 242 /**
246 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, 243 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
247 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in 244 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
248 * both time and space. 245 * both time and space.
249 * 246 *
250 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired 247 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
(...skipping 733 matching lines...) Expand 10 before | Expand all | Expand 10 after
984 } 981 }
985 } 982 }
986 } 983 }
987 } 984 }
988 } 985 }
989 986
990 /** 987 /**
991 * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance. 988 * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance.
992 */ 989 */
993 static final class UnsafeProcessor extends Processor { 990 static final class UnsafeProcessor extends Processor {
994 private static final sun.misc.Unsafe UNSAFE = getUnsafe();
995 private static final long BUFFER_ADDRESS_OFFSET =
996 fieldOffset(field(Buffer.class, "address"));
997 private static final int ARRAY_BASE_OFFSET = byteArrayBaseOffset();
998
999 /**
1000 * We only use Unsafe operations if we have access to direct {@link ByteBuffer}'s address
1001 * and the array base offset is a multiple of 8 (needed by Unsafe.getLong()).
1002 */
1003 private static final boolean AVAILABLE =
1004 BUFFER_ADDRESS_OFFSET != -1 && ARRAY_BASE_OFFSET % 8 == 0;
1005
1006 /** 991 /**
1007 * Indicates whether or not all required unsafe operations are supported on this platform. 992 * Indicates whether or not all required unsafe operations are supported on this platform.
1008 */ 993 */
1009 static boolean isAvailable() { 994 static boolean isAvailable() {
1010 return AVAILABLE; 995 return hasUnsafeArrayOperations() && hasUnsafeByteBufferOperations();
1011 } 996 }
1012 997
1013 @Override 998 @Override
1014 int partialIsValidUtf8(int state, byte[] bytes, final int index, final int l imit) { 999 int partialIsValidUtf8(int state, byte[] bytes, final int index, final int l imit) {
1015 if ((index | limit | bytes.length - limit) < 0) { 1000 if ((index | limit | bytes.length - limit) < 0) {
1016 throw new ArrayIndexOutOfBoundsException( 1001 throw new ArrayIndexOutOfBoundsException(
1017 String.format("Array length=%d, index=%d, limit=%d", bytes.length, i ndex, limit)); 1002 String.format("Array length=%d, index=%d, limit=%d", bytes.length, i ndex, limit));
1018 } 1003 }
1019 long offset = ARRAY_BASE_OFFSET + index; 1004 long offset = getArrayBaseOffset() + index;
1020 final long offsetLimit = ARRAY_BASE_OFFSET + limit; 1005 final long offsetLimit = getArrayBaseOffset() + limit;
1021 if (state != COMPLETE) { 1006 if (state != COMPLETE) {
1022 // The previous decoding operation was incomplete (or malformed). 1007 // The previous decoding operation was incomplete (or malformed).
1023 // We look for a well-formed sequence consisting of bytes from 1008 // We look for a well-formed sequence consisting of bytes from
1024 // the previous decoding operation (stored in state) together 1009 // the previous decoding operation (stored in state) together
1025 // with bytes from the array slice. 1010 // with bytes from the array slice.
1026 // 1011 //
1027 // We expect such "straddler characters" to be rare. 1012 // We expect such "straddler characters" to be rare.
1028 1013
1029 if (offset >= offsetLimit) { // No bytes? No progress. 1014 if (offset >= offsetLimit) { // No bytes? No progress.
1030 return state; 1015 return state;
1031 } 1016 }
1032 int byte1 = (byte) state; 1017 int byte1 = (byte) state;
1033 // byte1 is never ASCII. 1018 // byte1 is never ASCII.
1034 if (byte1 < (byte) 0xE0) { 1019 if (byte1 < (byte) 0xE0) {
1035 // two-byte form 1020 // two-byte form
1036 1021
1037 // Simultaneously checks for illegal trailing-byte in 1022 // Simultaneously checks for illegal trailing-byte in
1038 // leading position and overlong 2-byte form. 1023 // leading position and overlong 2-byte form.
1039 if (byte1 < (byte) 0xC2 1024 if (byte1 < (byte) 0xC2
1040 // byte2 trailing-byte test 1025 // byte2 trailing-byte test
1041 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { 1026 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1042 return MALFORMED; 1027 return MALFORMED;
1043 } 1028 }
1044 } else if (byte1 < (byte) 0xF0) { 1029 } else if (byte1 < (byte) 0xF0) {
1045 // three-byte form 1030 // three-byte form
1046 1031
1047 // Get byte2 from saved state or array 1032 // Get byte2 from saved state or array
1048 int byte2 = (byte) ~(state >> 8); 1033 int byte2 = (byte) ~(state >> 8);
1049 if (byte2 == 0) { 1034 if (byte2 == 0) {
1050 byte2 = UNSAFE.getByte(bytes, offset++); 1035 byte2 = UnsafeUtil.getByte(bytes, offset++);
1051 if (offset >= offsetLimit) { 1036 if (offset >= offsetLimit) {
1052 return incompleteStateFor(byte1, byte2); 1037 return incompleteStateFor(byte1, byte2);
1053 } 1038 }
1054 } 1039 }
1055 if (byte2 > (byte) 0xBF 1040 if (byte2 > (byte) 0xBF
1056 // overlong? 5 most significant bits must not all be zero 1041 // overlong? 5 most significant bits must not all be zero
1057 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1042 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1058 // illegal surrogate codepoint? 1043 // illegal surrogate codepoint?
1059 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1044 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1060 // byte3 trailing-byte test 1045 // byte3 trailing-byte test
1061 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { 1046 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1062 return MALFORMED; 1047 return MALFORMED;
1063 } 1048 }
1064 } else { 1049 } else {
1065 // four-byte form 1050 // four-byte form
1066 1051
1067 // Get byte2 and byte3 from saved state or array 1052 // Get byte2 and byte3 from saved state or array
1068 int byte2 = (byte) ~(state >> 8); 1053 int byte2 = (byte) ~(state >> 8);
1069 int byte3 = 0; 1054 int byte3 = 0;
1070 if (byte2 == 0) { 1055 if (byte2 == 0) {
1071 byte2 = UNSAFE.getByte(bytes, offset++); 1056 byte2 = UnsafeUtil.getByte(bytes, offset++);
1072 if (offset >= offsetLimit) { 1057 if (offset >= offsetLimit) {
1073 return incompleteStateFor(byte1, byte2); 1058 return incompleteStateFor(byte1, byte2);
1074 } 1059 }
1075 } else { 1060 } else {
1076 byte3 = (byte) (state >> 16); 1061 byte3 = (byte) (state >> 16);
1077 } 1062 }
1078 if (byte3 == 0) { 1063 if (byte3 == 0) {
1079 byte3 = UNSAFE.getByte(bytes, offset++); 1064 byte3 = UnsafeUtil.getByte(bytes, offset++);
1080 if (offset >= offsetLimit) { 1065 if (offset >= offsetLimit) {
1081 return incompleteStateFor(byte1, byte2, byte3); 1066 return incompleteStateFor(byte1, byte2, byte3);
1082 } 1067 }
1083 } 1068 }
1084 1069
1085 // If we were called with state == MALFORMED, then byte1 is 0xFF, 1070 // If we were called with state == MALFORMED, then byte1 is 0xFF,
1086 // which never occurs in well-formed UTF-8, and so we will return 1071 // which never occurs in well-formed UTF-8, and so we will return
1087 // MALFORMED again below. 1072 // MALFORMED again below.
1088 1073
1089 if (byte2 > (byte) 0xBF 1074 if (byte2 > (byte) 0xBF
1090 // Check that 1 <= plane <= 16. Tricky optimized form of: 1075 // Check that 1 <= plane <= 16. Tricky optimized form of:
1091 // if (byte1 > (byte) 0xF4 || 1076 // if (byte1 > (byte) 0xF4 ||
1092 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1077 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1093 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1078 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1094 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1079 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1095 // byte3 trailing-byte test 1080 // byte3 trailing-byte test
1096 || byte3 > (byte) 0xBF 1081 || byte3 > (byte) 0xBF
1097 // byte4 trailing-byte test 1082 // byte4 trailing-byte test
1098 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { 1083 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1099 return MALFORMED; 1084 return MALFORMED;
1100 } 1085 }
1101 } 1086 }
1102 } 1087 }
1103 1088
1104 return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset)); 1089 return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset));
1105 } 1090 }
1106 1091
1107 @Override 1092 @Override
1108 int partialIsValidUtf8Direct( 1093 int partialIsValidUtf8Direct(
(...skipping 18 matching lines...) Expand all
1127 1112
1128 final int byte1 = (byte) state; 1113 final int byte1 = (byte) state;
1129 // byte1 is never ASCII. 1114 // byte1 is never ASCII.
1130 if (byte1 < (byte) 0xE0) { 1115 if (byte1 < (byte) 0xE0) {
1131 // two-byte form 1116 // two-byte form
1132 1117
1133 // Simultaneously checks for illegal trailing-byte in 1118 // Simultaneously checks for illegal trailing-byte in
1134 // leading position and overlong 2-byte form. 1119 // leading position and overlong 2-byte form.
1135 if (byte1 < (byte) 0xC2 1120 if (byte1 < (byte) 0xC2
1136 // byte2 trailing-byte test 1121 // byte2 trailing-byte test
1137 || UNSAFE.getByte(address++) > (byte) 0xBF) { 1122 || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1138 return MALFORMED; 1123 return MALFORMED;
1139 } 1124 }
1140 } else if (byte1 < (byte) 0xF0) { 1125 } else if (byte1 < (byte) 0xF0) {
1141 // three-byte form 1126 // three-byte form
1142 1127
1143 // Get byte2 from saved state or array 1128 // Get byte2 from saved state or array
1144 int byte2 = (byte) ~(state >> 8); 1129 int byte2 = (byte) ~(state >> 8);
1145 if (byte2 == 0) { 1130 if (byte2 == 0) {
1146 byte2 = UNSAFE.getByte(address++); 1131 byte2 = UnsafeUtil.getByte(address++);
1147 if (address >= addressLimit) { 1132 if (address >= addressLimit) {
1148 return incompleteStateFor(byte1, byte2); 1133 return incompleteStateFor(byte1, byte2);
1149 } 1134 }
1150 } 1135 }
1151 if (byte2 > (byte) 0xBF 1136 if (byte2 > (byte) 0xBF
1152 // overlong? 5 most significant bits must not all be zero 1137 // overlong? 5 most significant bits must not all be zero
1153 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1138 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1154 // illegal surrogate codepoint? 1139 // illegal surrogate codepoint?
1155 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1140 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1156 // byte3 trailing-byte test 1141 // byte3 trailing-byte test
1157 || UNSAFE.getByte(address++) > (byte) 0xBF) { 1142 || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1158 return MALFORMED; 1143 return MALFORMED;
1159 } 1144 }
1160 } else { 1145 } else {
1161 // four-byte form 1146 // four-byte form
1162 1147
1163 // Get byte2 and byte3 from saved state or array 1148 // Get byte2 and byte3 from saved state or array
1164 int byte2 = (byte) ~(state >> 8); 1149 int byte2 = (byte) ~(state >> 8);
1165 int byte3 = 0; 1150 int byte3 = 0;
1166 if (byte2 == 0) { 1151 if (byte2 == 0) {
1167 byte2 = UNSAFE.getByte(address++); 1152 byte2 = UnsafeUtil.getByte(address++);
1168 if (address >= addressLimit) { 1153 if (address >= addressLimit) {
1169 return incompleteStateFor(byte1, byte2); 1154 return incompleteStateFor(byte1, byte2);
1170 } 1155 }
1171 } else { 1156 } else {
1172 byte3 = (byte) (state >> 16); 1157 byte3 = (byte) (state >> 16);
1173 } 1158 }
1174 if (byte3 == 0) { 1159 if (byte3 == 0) {
1175 byte3 = UNSAFE.getByte(address++); 1160 byte3 = UnsafeUtil.getByte(address++);
1176 if (address >= addressLimit) { 1161 if (address >= addressLimit) {
1177 return incompleteStateFor(byte1, byte2, byte3); 1162 return incompleteStateFor(byte1, byte2, byte3);
1178 } 1163 }
1179 } 1164 }
1180 1165
1181 // If we were called with state == MALFORMED, then byte1 is 0xFF, 1166 // If we were called with state == MALFORMED, then byte1 is 0xFF,
1182 // which never occurs in well-formed UTF-8, and so we will return 1167 // which never occurs in well-formed UTF-8, and so we will return
1183 // MALFORMED again below. 1168 // MALFORMED again below.
1184 1169
1185 if (byte2 > (byte) 0xBF 1170 if (byte2 > (byte) 0xBF
1186 // Check that 1 <= plane <= 16. Tricky optimized form of: 1171 // Check that 1 <= plane <= 16. Tricky optimized form of:
1187 // if (byte1 > (byte) 0xF4 || 1172 // if (byte1 > (byte) 0xF4 ||
1188 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1173 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1189 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1174 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1190 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1175 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1191 // byte3 trailing-byte test 1176 // byte3 trailing-byte test
1192 || byte3 > (byte) 0xBF 1177 || byte3 > (byte) 0xBF
1193 // byte4 trailing-byte test 1178 // byte4 trailing-byte test
1194 || UNSAFE.getByte(address++) > (byte) 0xBF) { 1179 || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1195 return MALFORMED; 1180 return MALFORMED;
1196 } 1181 }
1197 } 1182 }
1198 } 1183 }
1199 1184
1200 return partialIsValidUtf8(address, (int) (addressLimit - address)); 1185 return partialIsValidUtf8(address, (int) (addressLimit - address));
1201 } 1186 }
1202 1187
1203 @Override 1188 @Override
1204 int encodeUtf8(final CharSequence in, final byte[] out, final int offset, fi nal int length) { 1189 int encodeUtf8(final CharSequence in, final byte[] out, final int offset, fi nal int length) {
1205 long outIx = ARRAY_BASE_OFFSET + offset; 1190 long outIx = getArrayBaseOffset() + offset;
1206 final long outLimit = outIx + length; 1191 final long outLimit = outIx + length;
1207 final int inLimit = in.length(); 1192 final int inLimit = in.length();
1208 if (inLimit > length || out.length - length < offset) { 1193 if (inLimit > length || out.length - length < offset) {
1209 // Not even enough room for an ASCII-encoded string. 1194 // Not even enough room for an ASCII-encoded string.
1210 throw new ArrayIndexOutOfBoundsException( 1195 throw new ArrayIndexOutOfBoundsException(
1211 "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset + length)); 1196 "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset + length));
1212 } 1197 }
1213 1198
1214 // Designed to take advantage of 1199 // Designed to take advantage of
1215 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination 1200 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination
1216 int inIx = 0; 1201 int inIx = 0;
1217 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { 1202 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {
1218 UNSAFE.putByte(out, outIx++, (byte) c); 1203 UnsafeUtil.putByte(out, outIx++, (byte) c);
1219 } 1204 }
1220 if (inIx == inLimit) { 1205 if (inIx == inLimit) {
1221 // We're done, it was ASCII encoded. 1206 // We're done, it was ASCII encoded.
1222 return (int) (outIx - ARRAY_BASE_OFFSET); 1207 return (int) (outIx - getArrayBaseOffset());
1223 } 1208 }
1224 1209
1225 for (char c; inIx < inLimit; ++inIx) { 1210 for (char c; inIx < inLimit; ++inIx) {
1226 c = in.charAt(inIx); 1211 c = in.charAt(inIx);
1227 if (c < 0x80 && outIx < outLimit) { 1212 if (c < 0x80 && outIx < outLimit) {
1228 UNSAFE.putByte(out, outIx++, (byte) c); 1213 UnsafeUtil.putByte(out, outIx++, (byte) c);
1229 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes 1214 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes
1230 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 6) | (c >>> 6))); 1215 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 6) | (c >>> 6)));
1231 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); 1216 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c)));
1232 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) { 1217 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) {
1233 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes 1218 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
1234 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 5) | (c >>> 12))); 1219 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 5) | (c >>> 12)));
1235 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); 1220 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
1236 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); 1221 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c)));
1237 } else if (outIx <= outLimit - 4L) { 1222 } else if (outIx <= outLimit - 4L) {
1238 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 1223 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
1239 // bytes 1224 // bytes
1240 final char low; 1225 final char low;
1241 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx )))) { 1226 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx )))) {
1242 throw new UnpairedSurrogateException((inIx - 1), inLimit); 1227 throw new UnpairedSurrogateException((inIx - 1), inLimit);
1243 } 1228 }
1244 int codePoint = toCodePoint(c, low); 1229 int codePoint = toCodePoint(c, low);
1245 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))) ; 1230 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 4) | (codePoint >>> 1 8)));
1246 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12) ))); 1231 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
1247 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)) )); 1232 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
1248 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & codePoint))); 1233 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & codePoint)));
1249 } else { 1234 } else {
1250 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) 1235 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)
1251 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1) ))) { 1236 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1) ))) {
1252 // We are surrogates and we're not a surrogate pair. 1237 // We are surrogates and we're not a surrogate pair.
1253 throw new UnpairedSurrogateException(inIx, inLimit); 1238 throw new UnpairedSurrogateException(inIx, inLimit);
1254 } 1239 }
1255 // Not enough space in the output buffer. 1240 // Not enough space in the output buffer.
1256 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx); 1241 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);
1257 } 1242 }
1258 } 1243 }
1259 1244
1260 // All bytes have been encoded. 1245 // All bytes have been encoded.
1261 return (int) (outIx - ARRAY_BASE_OFFSET); 1246 return (int) (outIx - getArrayBaseOffset());
1262 } 1247 }
1263 1248
1264 @Override 1249 @Override
1265 void encodeUtf8Direct(CharSequence in, ByteBuffer out) { 1250 void encodeUtf8Direct(CharSequence in, ByteBuffer out) {
1266 final long address = addressOffset(out); 1251 final long address = addressOffset(out);
1267 long outIx = address + out.position(); 1252 long outIx = address + out.position();
1268 final long outLimit = address + out.limit(); 1253 final long outLimit = address + out.limit();
1269 final int inLimit = in.length(); 1254 final int inLimit = in.length();
1270 if (inLimit > outLimit - outIx) { 1255 if (inLimit > outLimit - outIx) {
1271 // Not even enough room for an ASCII-encoded string. 1256 // Not even enough room for an ASCII-encoded string.
1272 throw new ArrayIndexOutOfBoundsException( 1257 throw new ArrayIndexOutOfBoundsException(
1273 "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limi t()); 1258 "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limi t());
1274 } 1259 }
1275 1260
1276 // Designed to take advantage of 1261 // Designed to take advantage of
1277 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination 1262 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination
1278 int inIx = 0; 1263 int inIx = 0;
1279 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { 1264 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {
1280 UNSAFE.putByte(outIx++, (byte) c); 1265 UnsafeUtil.putByte(outIx++, (byte) c);
1281 } 1266 }
1282 if (inIx == inLimit) { 1267 if (inIx == inLimit) {
1283 // We're done, it was ASCII encoded. 1268 // We're done, it was ASCII encoded.
1284 out.position((int) (outIx - address)); 1269 out.position((int) (outIx - address));
1285 return; 1270 return;
1286 } 1271 }
1287 1272
1288 for (char c; inIx < inLimit; ++inIx) { 1273 for (char c; inIx < inLimit; ++inIx) {
1289 c = in.charAt(inIx); 1274 c = in.charAt(inIx);
1290 if (c < 0x80 && outIx < outLimit) { 1275 if (c < 0x80 && outIx < outLimit) {
1291 UNSAFE.putByte(outIx++, (byte) c); 1276 UnsafeUtil.putByte(outIx++, (byte) c);
1292 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes 1277 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes
1293 UNSAFE.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6))); 1278 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6)));
1294 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & c))); 1279 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c)));
1295 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) { 1280 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) {
1296 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes 1281 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
1297 UNSAFE.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12))); 1282 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12)));
1298 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); 1283 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
1299 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & c))); 1284 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c)));
1300 } else if (outIx <= outLimit - 4L) { 1285 } else if (outIx <= outLimit - 4L) {
1301 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 1286 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
1302 // bytes 1287 // bytes
1303 final char low; 1288 final char low;
1304 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx )))) { 1289 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx )))) {
1305 throw new UnpairedSurrogateException((inIx - 1), inLimit); 1290 throw new UnpairedSurrogateException((inIx - 1), inLimit);
1306 } 1291 }
1307 int codePoint = toCodePoint(c, low); 1292 int codePoint = toCodePoint(c, low);
1308 UNSAFE.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); 1293 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)));
1309 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); 1294 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)) ));
1310 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); 1295 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))) );
1311 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint))); 1296 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint)));
1312 } else { 1297 } else {
1313 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) 1298 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)
1314 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1) ))) { 1299 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1) ))) {
1315 // We are surrogates and we're not a surrogate pair. 1300 // We are surrogates and we're not a surrogate pair.
1316 throw new UnpairedSurrogateException(inIx, inLimit); 1301 throw new UnpairedSurrogateException(inIx, inLimit);
1317 } 1302 }
1318 // Not enough space in the output buffer. 1303 // Not enough space in the output buffer.
1319 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx); 1304 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);
1320 } 1305 }
1321 } 1306 }
(...skipping 20 matching lines...) Expand all
1342 return 0; 1327 return 0;
1343 } 1328 }
1344 1329
1345 // Read bytes until 8-byte aligned so that we can read longs in the loop below. 1330 // Read bytes until 8-byte aligned so that we can read longs in the loop below.
1346 // Byte arrays are already either 8 or 16-byte aligned, so we just need to make sure that 1331 // Byte arrays are already either 8 or 16-byte aligned, so we just need to make sure that
1347 // the index (relative to the start of the array) is also 8-byte aligned. We do this by 1332 // the index (relative to the start of the array) is also 8-byte aligned. We do this by
1348 // ANDing the index with 7 to determine the number of bytes that need to be read before 1333 // ANDing the index with 7 to determine the number of bytes that need to be read before
1349 // we're 8-byte aligned. 1334 // we're 8-byte aligned.
1350 final int unaligned = (int) offset & 7; 1335 final int unaligned = (int) offset & 7;
1351 for (int j = unaligned; j > 0; j--) { 1336 for (int j = unaligned; j > 0; j--) {
1352 if (UNSAFE.getByte(bytes, offset++) < 0) { 1337 if (UnsafeUtil.getByte(bytes, offset++) < 0) {
1353 return unaligned - j; 1338 return unaligned - j;
1354 } 1339 }
1355 } 1340 }
1356 1341
1357 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 1342 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
1358 // To speed things up further, we're reading longs instead of bytes so we use a mask to 1343 // To speed things up further, we're reading longs instead of bytes so we use a mask to
1359 // determine if any byte in the current long is non-ASCII. 1344 // determine if any byte in the current long is non-ASCII.
1360 remaining -= unaligned; 1345 remaining -= unaligned;
1361 for (; remaining >= 8 && (UNSAFE.getLong(bytes, offset) & ASCII_MASK_LONG) == 0; 1346 for (; remaining >= 8 && (UnsafeUtil.getLong(bytes, offset) & ASCII_MASK_L ONG) == 0;
1362 offset += 8, remaining -= 8) {} 1347 offset += 8, remaining -= 8) {}
1363 return maxChars - remaining; 1348 return maxChars - remaining;
1364 } 1349 }
1365 1350
1366 /** 1351 /**
1367 * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} except that it uses the 1352 * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} except that it uses the
1368 * most efficient method available to the platform. 1353 * most efficient method available to the platform.
1369 */ 1354 */
1370 private static int unsafeEstimateConsecutiveAscii(long address, final int ma xChars) { 1355 private static int unsafeEstimateConsecutiveAscii(long address, final int ma xChars) {
1371 int remaining = maxChars; 1356 int remaining = maxChars;
1372 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) { 1357 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) {
1373 // Don't bother with small strings. 1358 // Don't bother with small strings.
1374 return 0; 1359 return 0;
1375 } 1360 }
1376 1361
1377 // Read bytes until 8-byte aligned so that we can read longs in the loop below. 1362 // Read bytes until 8-byte aligned so that we can read longs in the loop below.
1378 // We do this by ANDing the address with 7 to determine the number of bytes that need to 1363 // We do this by ANDing the address with 7 to determine the number of bytes that need to
1379 // be read before we're 8-byte aligned. 1364 // be read before we're 8-byte aligned.
1380 final int unaligned = (int) address & 7; 1365 final int unaligned = (int) address & 7;
1381 for (int j = unaligned; j > 0; j--) { 1366 for (int j = unaligned; j > 0; j--) {
1382 if (UNSAFE.getByte(address++) < 0) { 1367 if (UnsafeUtil.getByte(address++) < 0) {
1383 return unaligned - j; 1368 return unaligned - j;
1384 } 1369 }
1385 } 1370 }
1386 1371
1387 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 1372 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
1388 // To speed things up further, we're reading longs instead of bytes so we use a mask to 1373 // To speed things up further, we're reading longs instead of bytes so we use a mask to
1389 // determine if any byte in the current long is non-ASCII. 1374 // determine if any byte in the current long is non-ASCII.
1390 remaining -= unaligned; 1375 remaining -= unaligned;
1391 for (; remaining >= 8 && (UNSAFE.getLong(address) & ASCII_MASK_LONG) == 0; 1376 for (; remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) = = 0;
1392 address += 8, remaining -= 8) {} 1377 address += 8, remaining -= 8) {}
1393 return maxChars - remaining; 1378 return maxChars - remaining;
1394 } 1379 }
1395 1380
1396 private static int partialIsValidUtf8(final byte[] bytes, long offset, int r emaining) { 1381 private static int partialIsValidUtf8(final byte[] bytes, long offset, int r emaining) {
1397 // Skip past ASCII characters as quickly as possible. 1382 // Skip past ASCII characters as quickly as possible.
1398 final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remainin g); 1383 final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remainin g);
1399 remaining -= skipped; 1384 remaining -= skipped;
1400 offset += skipped; 1385 offset += skipped;
1401 1386
1402 for (;;) { 1387 for (;;) {
1403 // Optimize for interior runs of ASCII bytes. 1388 // Optimize for interior runs of ASCII bytes.
1404 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold? 1389 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
1405 // Maybe after seeing a few in a row that are ASCII, go back to fast mode? 1390 // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
1406 int byte1 = 0; 1391 int byte1 = 0;
1407 for (; remaining > 0 && (byte1 = UNSAFE.getByte(bytes, offset++)) >= 0; --remaining) { 1392 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >= 0; --remaining) {
1408 } 1393 }
1409 if (remaining == 0) { 1394 if (remaining == 0) {
1410 return COMPLETE; 1395 return COMPLETE;
1411 } 1396 }
1412 remaining--; 1397 remaining--;
1413 1398
1414 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms. 1399 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms.
1415 if (byte1 < (byte) 0xE0) { 1400 if (byte1 < (byte) 0xE0) {
1416 // Two-byte form (110xxxxx 10xxxxxx) 1401 // Two-byte form (110xxxxx 10xxxxxx)
1417 if (remaining == 0) { 1402 if (remaining == 0) {
1418 // Incomplete sequence 1403 // Incomplete sequence
1419 return byte1; 1404 return byte1;
1420 } 1405 }
1421 remaining--; 1406 remaining--;
1422 1407
1423 // Simultaneously checks for illegal trailing-byte in 1408 // Simultaneously checks for illegal trailing-byte in
1424 // leading position and overlong 2-byte form. 1409 // leading position and overlong 2-byte form.
1425 if (byte1 < (byte) 0xC2 1410 if (byte1 < (byte) 0xC2
1426 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { 1411 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1427 return MALFORMED; 1412 return MALFORMED;
1428 } 1413 }
1429 } else if (byte1 < (byte) 0xF0) { 1414 } else if (byte1 < (byte) 0xF0) {
1430 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx) 1415 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
1431 if (remaining < 2) { 1416 if (remaining < 2) {
1432 // Incomplete sequence 1417 // Incomplete sequence
1433 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); 1418 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);
1434 } 1419 }
1435 remaining -= 2; 1420 remaining -= 2;
1436 1421
1437 final int byte2; 1422 final int byte2;
1438 if ((byte2 = UNSAFE.getByte(bytes, offset++)) > (byte) 0xBF 1423 if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF
1439 // overlong? 5 most significant bits must not all be zero 1424 // overlong? 5 most significant bits must not all be zero
1440 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1425 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1441 // check for illegal surrogate codepoints 1426 // check for illegal surrogate codepoints
1442 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1427 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1443 // byte3 trailing-byte test 1428 // byte3 trailing-byte test
1444 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { 1429 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1445 return MALFORMED; 1430 return MALFORMED;
1446 } 1431 }
1447 } else { 1432 } else {
1448 // Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) 1433 // Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
1449 if (remaining < 3) { 1434 if (remaining < 3) {
1450 // Incomplete sequence 1435 // Incomplete sequence
1451 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); 1436 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);
1452 } 1437 }
1453 remaining -= 3; 1438 remaining -= 3;
1454 1439
1455 final int byte2; 1440 final int byte2;
1456 if ((byte2 = UNSAFE.getByte(bytes, offset++)) > (byte) 0xBF 1441 if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF
1457 // Check that 1 <= plane <= 16. Tricky optimized form of: 1442 // Check that 1 <= plane <= 16. Tricky optimized form of:
1458 // if (byte1 > (byte) 0xF4 || 1443 // if (byte1 > (byte) 0xF4 ||
1459 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1444 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1460 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1445 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1461 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1446 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1462 // byte3 trailing-byte test 1447 // byte3 trailing-byte test
1463 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF 1448 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF
1464 // byte4 trailing-byte test 1449 // byte4 trailing-byte test
1465 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { 1450 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1466 return MALFORMED; 1451 return MALFORMED;
1467 } 1452 }
1468 } 1453 }
1469 } 1454 }
1470 } 1455 }
1471 1456
1472 private static int partialIsValidUtf8(long address, int remaining) { 1457 private static int partialIsValidUtf8(long address, int remaining) {
1473 // Skip past ASCII characters as quickly as possible. 1458 // Skip past ASCII characters as quickly as possible.
1474 final int skipped = unsafeEstimateConsecutiveAscii(address, remaining); 1459 final int skipped = unsafeEstimateConsecutiveAscii(address, remaining);
1475 address += skipped; 1460 address += skipped;
1476 remaining -= skipped; 1461 remaining -= skipped;
1477 1462
1478 for (;;) { 1463 for (;;) {
1479 // Optimize for interior runs of ASCII bytes. 1464 // Optimize for interior runs of ASCII bytes.
1480 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold? 1465 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
1481 // Maybe after seeing a few in a row that are ASCII, go back to fast mode? 1466 // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
1482 int byte1 = 0; 1467 int byte1 = 0;
1483 for (; remaining > 0 && (byte1 = UNSAFE.getByte(address++)) >= 0; --rema ining) { 1468 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; -- remaining) {
1484 } 1469 }
1485 if (remaining == 0) { 1470 if (remaining == 0) {
1486 return COMPLETE; 1471 return COMPLETE;
1487 } 1472 }
1488 remaining--; 1473 remaining--;
1489 1474
1490 if (byte1 < (byte) 0xE0) { 1475 if (byte1 < (byte) 0xE0) {
1491 // Two-byte form 1476 // Two-byte form
1492 1477
1493 if (remaining == 0) { 1478 if (remaining == 0) {
1494 // Incomplete sequence 1479 // Incomplete sequence
1495 return byte1; 1480 return byte1;
1496 } 1481 }
1497 remaining--; 1482 remaining--;
1498 1483
1499 // Simultaneously checks for illegal trailing-byte in 1484 // Simultaneously checks for illegal trailing-byte in
1500 // leading position and overlong 2-byte form. 1485 // leading position and overlong 2-byte form.
1501 if (byte1 < (byte) 0xC2 || UNSAFE.getByte(address++) > (byte) 0xBF) { 1486 if (byte1 < (byte) 0xC2 || UnsafeUtil.getByte(address++) > (byte) 0xBF ) {
1502 return MALFORMED; 1487 return MALFORMED;
1503 } 1488 }
1504 } else if (byte1 < (byte) 0xF0) { 1489 } else if (byte1 < (byte) 0xF0) {
1505 // Three-byte form 1490 // Three-byte form
1506 1491
1507 if (remaining < 2) { 1492 if (remaining < 2) {
1508 // Incomplete sequence 1493 // Incomplete sequence
1509 return unsafeIncompleteStateFor(address, byte1, remaining); 1494 return unsafeIncompleteStateFor(address, byte1, remaining);
1510 } 1495 }
1511 remaining -= 2; 1496 remaining -= 2;
1512 1497
1513 final byte byte2 = UNSAFE.getByte(address++); 1498 final byte byte2 = UnsafeUtil.getByte(address++);
1514 if (byte2 > (byte) 0xBF 1499 if (byte2 > (byte) 0xBF
1515 // overlong? 5 most significant bits must not all be zero 1500 // overlong? 5 most significant bits must not all be zero
1516 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1501 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1517 // check for illegal surrogate codepoints 1502 // check for illegal surrogate codepoints
1518 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1503 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1519 // byte3 trailing-byte test 1504 // byte3 trailing-byte test
1520 || UNSAFE.getByte(address++) > (byte) 0xBF) { 1505 || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1521 return MALFORMED; 1506 return MALFORMED;
1522 } 1507 }
1523 } else { 1508 } else {
1524 // Four-byte form 1509 // Four-byte form
1525 1510
1526 if (remaining < 3) { 1511 if (remaining < 3) {
1527 // Incomplete sequence 1512 // Incomplete sequence
1528 return unsafeIncompleteStateFor(address, byte1, remaining); 1513 return unsafeIncompleteStateFor(address, byte1, remaining);
1529 } 1514 }
1530 remaining -= 3; 1515 remaining -= 3;
1531 1516
1532 final byte byte2 = UNSAFE.getByte(address++); 1517 final byte byte2 = UnsafeUtil.getByte(address++);
1533 if (byte2 > (byte) 0xBF 1518 if (byte2 > (byte) 0xBF
1534 // Check that 1 <= plane <= 16. Tricky optimized form of: 1519 // Check that 1 <= plane <= 16. Tricky optimized form of:
1535 // if (byte1 > (byte) 0xF4 || 1520 // if (byte1 > (byte) 0xF4 ||
1536 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1521 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1537 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1522 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1538 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1523 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1539 // byte3 trailing-byte test 1524 // byte3 trailing-byte test
1540 || UNSAFE.getByte(address++) > (byte) 0xBF 1525 || UnsafeUtil.getByte(address++) > (byte) 0xBF
1541 // byte4 trailing-byte test 1526 // byte4 trailing-byte test
1542 || UNSAFE.getByte(address++) > (byte) 0xBF) { 1527 || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1543 return MALFORMED; 1528 return MALFORMED;
1544 } 1529 }
1545 } 1530 }
1546 } 1531 }
1547 } 1532 }
1548 1533
1549 private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long of fset, 1534 private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long of fset,
1550 int remaining) { 1535 int remaining) {
1551 switch (remaining) { 1536 switch (remaining) {
1552 case 0: { 1537 case 0: {
1553 return incompleteStateFor(byte1); 1538 return incompleteStateFor(byte1);
1554 } 1539 }
1555 case 1: { 1540 case 1: {
1556 return incompleteStateFor(byte1, UNSAFE.getByte(bytes, offset)); 1541 return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset));
1557 } 1542 }
1558 case 2: { 1543 case 2: {
1559 return incompleteStateFor(byte1, UNSAFE.getByte(bytes, offset), 1544 return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset),
1560 UNSAFE.getByte(bytes, offset + 1)); 1545 UnsafeUtil.getByte(bytes, offset + 1));
1561 } 1546 }
1562 default: { 1547 default: {
1563 throw new AssertionError(); 1548 throw new AssertionError();
1564 } 1549 }
1565 } 1550 }
1566 } 1551 }
1567 1552
1568 private static int unsafeIncompleteStateFor(long address, final int byte1, i nt remaining) { 1553 private static int unsafeIncompleteStateFor(long address, final int byte1, i nt remaining) {
1569 switch (remaining) { 1554 switch (remaining) {
1570 case 0: { 1555 case 0: {
1571 return incompleteStateFor(byte1); 1556 return incompleteStateFor(byte1);
1572 } 1557 }
1573 case 1: { 1558 case 1: {
1574 return incompleteStateFor(byte1, UNSAFE.getByte(address)); 1559 return incompleteStateFor(byte1, UnsafeUtil.getByte(address));
1575 } 1560 }
1576 case 2: { 1561 case 2: {
1577 return incompleteStateFor(byte1, UNSAFE.getByte(address), UNSAFE.getBy te(address + 1)); 1562 return incompleteStateFor(byte1, UnsafeUtil.getByte(address),
1563 UnsafeUtil.getByte(address + 1));
1578 } 1564 }
1579 default: { 1565 default: {
1580 throw new AssertionError(); 1566 throw new AssertionError();
1581 } 1567 }
1582 } 1568 }
1583 } 1569 }
1584
1585 /**
1586 * Gets the field with the given name within the class, or {@code null} if not found. If
1587 * found, the field is made accessible.
1588 */
1589 private static Field field(Class<?> clazz, String fieldName) {
1590 Field field;
1591 try {
1592 field = clazz.getDeclaredField(fieldName);
1593 field.setAccessible(true);
1594 } catch (Throwable t) {
1595 // Failed to access the fields.
1596 field = null;
1597 }
1598 logger.log(Level.FINEST, "{0}.{1}: {2}",
1599 new Object[] {clazz.getName(), fieldName, (field != null ? "available" : "unavailable")});
1600 return field;
1601 }
1602
1603 /**
1604 * Returns the offset of the provided field, or {@code -1} if {@code sun.misc.Unsafe} is not
1605 * available.
1606 */
1607 private static long fieldOffset(Field field) {
1608 return field == null || UNSAFE == null ? -1 : UNSAFE.objectFieldOffset(fie ld);
1609 }
1610
1611 /**
1612 * Get the base offset for byte arrays, or {@code -1} if {@code sun.misc.Unsafe} is not
1613 * available.
1614 */
1615 private static <T> int byteArrayBaseOffset() {
1616 return UNSAFE == null ? -1 : UNSAFE.arrayBaseOffset(byte[].class);
1617 }
1618
1619 /**
1620 * Gets the offset of the {@code address} field of the given direct {@link ByteBuffer}.
1621 */
1622 private static long addressOffset(ByteBuffer buffer) {
1623 return UNSAFE.getLong(buffer, BUFFER_ADDRESS_OFFSET);
1624 }
1625
1626 /**
1627 * Gets the {@code sun.misc.Unsafe} instance, or {@code null} if not available on this
1628 * platform.
1629 */
1630 private static sun.misc.Unsafe getUnsafe() {
1631 sun.misc.Unsafe unsafe = null;
1632 try {
1633 unsafe = AccessController.doPrivileged(new PrivilegedExceptionAction<sun .misc.Unsafe>() {
1634 @Override
1635 public sun.misc.Unsafe run() throws Exception {
1636 Class<sun.misc.Unsafe> k = sun.misc.Unsafe.class;
1637
1638 // Check that this platform supports all of the required unsafe methods.
1639 checkRequiredMethods(k);
1640
1641 for (Field f : k.getDeclaredFields()) {
1642 f.setAccessible(true);
1643 Object x = f.get(null);
1644 if (k.isInstance(x)) {
1645 return k.cast(x);
1646 }
1647 }
1648 // The sun.misc.Unsafe field does not exist.
1649 return null;
1650 }
1651 });
1652 } catch (Throwable e) {
1653 // Catching Throwable here due to the fact that Google AppEngine raises NoClassDefFoundError
1654 // for Unsafe.
1655 }
1656
1657 logger.log(Level.FINEST, "sun.misc.Unsafe: {}",
1658 unsafe != null ? "available" : "unavailable");
1659 return unsafe;
1660 }
1661
1662 /**
1663 * Verifies that all required methods of {@code sun.misc.Unsafe} are available on this platform.
1664 */
1665 private static void checkRequiredMethods(Class<sun.misc.Unsafe> clazz)
1666 throws NoSuchMethodException, SecurityException {
1667 // Needed for Unsafe byte[] access
1668 clazz.getMethod("arrayBaseOffset", Class.class);
1669 clazz.getMethod("getByte", Object.class, long.class);
1670 clazz.getMethod("putByte", Object.class, long.class, byte.class);
1671 clazz.getMethod("getLong", Object.class, long.class);
1672
1673 // Needed for Unsafe Direct ByteBuffer access
1674 clazz.getMethod("objectFieldOffset", Field.class);
1675 clazz.getMethod("getByte", long.class);
1676 clazz.getMethod("getLong", Object.class, long.class);
1677 clazz.getMethod("putByte", long.class, byte.class);
1678 clazz.getMethod("getLong", long.class);
1679 }
1680 } 1570 }
1681 1571
1682 private Utf8() {} 1572 private Utf8() {}
1683 } 1573 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698