third_party/protobuf/java/core/src/main/java/com/google/protobuf/Utf8.java - Issue 2599263002: third_party/protobuf: Update to HEAD (f52e188fe4)

Side by Side Diff: third_party/protobuf/java/core/src/main/java/com/google/protobuf/Utf8.java

Issue 2599263002: third_party/protobuf: Update to HEAD (f52e188fe4) (Closed)

Patch Set: Address comments Created 3 years, 12 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « third_party/protobuf/java/core/src/main/java/com/google/protobuf/UnsafeUtil.java ('k') | third_party/protobuf/java/core/src/main/java/com/google/protobuf/WireFormat.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Protocol Buffers - Google's data interchange format	1 // Protocol Buffers - Google's data interchange format

2 // Copyright 2008 Google Inc. All rights reserved.	2 // Copyright 2008 Google Inc. All rights reserved.

3 // https://developers.google.com/protocol-buffers/	3 // https://developers.google.com/protocol-buffers/

4 //	4 //

5 // Redistribution and use in source and binary forms, with or without	5 // Redistribution and use in source and binary forms, with or without

6 // modification, are permitted provided that the following conditions are	6 // modification, are permitted provided that the following conditions are

7 // met:	7 // met:

8 //	8 //

9 // * Redistributions of source code must retain the above copyright	9 // * Redistributions of source code must retain the above copyright

10 // notice, this list of conditions and the following disclaimer.	10 // notice, this list of conditions and the following disclaimer.

(...skipping 12 matching lines...) Expand all Loading...
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,	23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT	24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,	25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY	26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT	27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE	28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.	29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

30	30

31 package com.google.protobuf;	31 package com.google.protobuf;

32	32

	33 import static com.google.protobuf.UnsafeUtil.addressOffset;

	34 import static com.google.protobuf.UnsafeUtil.getArrayBaseOffset;

	35 import static com.google.protobuf.UnsafeUtil.hasUnsafeArrayOperations;

	36 import static com.google.protobuf.UnsafeUtil.hasUnsafeByteBufferOperations;

33 import static java.lang.Character.MAX_SURROGATE;	37 import static java.lang.Character.MAX_SURROGATE;

34 import static java.lang.Character.MIN_SURROGATE;	38 import static java.lang.Character.MIN_SURROGATE;

35 import static java.lang.Character.isSurrogatePair;	39 import static java.lang.Character.isSurrogatePair;

36 import static java.lang.Character.toCodePoint;	40 import static java.lang.Character.toCodePoint;

37	41

38 import java.lang.reflect.Field;

39 import java.nio.Buffer;

40 import java.nio.ByteBuffer;	42 import java.nio.ByteBuffer;

41 import java.security.AccessController;

42 import java.security.PrivilegedExceptionAction;

43 import java.util.logging.Level;

44 import java.util.logging.Logger;

45	43

46 /**	44 /**

47 * A set of low-level, high-performance static utility methods related	45 * A set of low-level, high-performance static utility methods related

48 * to the UTF-8 character encoding. This class has no dependencies	46 * to the UTF-8 character encoding. This class has no dependencies

49 * outside of the core JDK libraries.	47 * outside of the core JDK libraries.

50 *	48 *

51 * <p>There are several variants of UTF-8. The one implemented by	49 * <p>There are several variants of UTF-8. The one implemented by

52 * this class is the restricted definition of UTF-8 introduced in	50 * this class is the restricted definition of UTF-8 introduced in

53 * Unicode 3.1, which mandates the rejection of "overlong" byte	51 * Unicode 3.1, which mandates the rejection of "overlong" byte

54 * sequences as well as rejection of 3-byte surrogate codepoint byte	52 * sequences as well as rejection of 3-byte surrogate codepoint byte

(...skipping 17 matching lines...) Expand all Loading...
72 * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is	70 * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is

73 * well-formed in the absence of additional input, or if the byte sequence	71 * well-formed in the absence of additional input, or if the byte sequence

74 * apparently terminated in the middle of a character, an opaque integer	72 * apparently terminated in the middle of a character, an opaque integer

75 * "state" value containing enough information to decode the character when	73 * "state" value containing enough information to decode the character when

76 * passed to a subsequent invocation of a partial decoding method.	74 * passed to a subsequent invocation of a partial decoding method.

77 *	75 *

78 * @author martinrb@google.com (Martin Buchholz)	76 * @author martinrb@google.com (Martin Buchholz)

79 */	77 */

80 // TODO(nathanmittler): Copy changes in this class back to Guava	78 // TODO(nathanmittler): Copy changes in this class back to Guava

81 final class Utf8 {	79 final class Utf8 {

82 private static final Logger logger = Logger.getLogger(Utf8.class.getName());

83	80

84 /**	81 /**

85 * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations	82 * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations

86 * depending on what is available on the platform. The processor is the platform-optimized	83 * depending on what is available on the platform. The processor is the platform-optimized

87 * delegate for which all methods are delegated directly to.	84 * delegate for which all methods are delegated directly to.

88 */	85 */

89 private static final Processor processor =	86 private static final Processor processor =

90 UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor( );	87 UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor( );

91	88

92 /**	89 /**

(...skipping 137 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
230 default:	227 default:

231 throw new AssertionError();	228 throw new AssertionError();

232 }	229 }

233 }	230 }

234	231

235 // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw	232 // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw

236 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can	233 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can

237 // fallback to more lenient behavior.	234 // fallback to more lenient behavior.

238	235

239 static class UnpairedSurrogateException extends IllegalArgumentException {	236 static class UnpairedSurrogateException extends IllegalArgumentException {

240 private UnpairedSurrogateException(int index, int length) {	237 UnpairedSurrogateException(int index, int length) {

241 super("Unpaired surrogate at index " + index + " of " + length);	238 super("Unpaired surrogate at index " + index + " of " + length);

242 }	239 }

243 }	240 }

244	241

245 /**	242 /**

246 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,	243 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,

247 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in	244 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in

248 * both time and space.	245 * both time and space.

249 *	246 *

250 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired	247 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired

(...skipping 733 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
984 }	981 }

985 }	982 }

986 }	983 }

987 }	984 }

988 }	985 }

989	986

990 /**	987 /**

991 * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance.	988 * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance.

992 */	989 */

993 static final class UnsafeProcessor extends Processor {	990 static final class UnsafeProcessor extends Processor {

994 private static final sun.misc.Unsafe UNSAFE = getUnsafe();

995 private static final long BUFFER_ADDRESS_OFFSET =

996 fieldOffset(field(Buffer.class, "address"));

997 private static final int ARRAY_BASE_OFFSET = byteArrayBaseOffset();

998

999 /**

1000 * We only use Unsafe operations if we have access to direct {@link ByteBuffer}'s address

1001 * and the array base offset is a multiple of 8 (needed by Unsafe.getLong()).

1002 */

1003 private static final boolean AVAILABLE =

1004 BUFFER_ADDRESS_OFFSET != -1 && ARRAY_BASE_OFFSET % 8 == 0;

1005

1006 /**	991 /**

1007 * Indicates whether or not all required unsafe operations are supported on this platform.	992 * Indicates whether or not all required unsafe operations are supported on this platform.

1008 */	993 */

1009 static boolean isAvailable() {	994 static boolean isAvailable() {

1010 return AVAILABLE;	995 return hasUnsafeArrayOperations() && hasUnsafeByteBufferOperations();

1011 }	996 }

1012	997

1013 @Override	998 @Override

1014 int partialIsValidUtf8(int state, byte[] bytes, final int index, final int l imit) {	999 int partialIsValidUtf8(int state, byte[] bytes, final int index, final int l imit) {

1015 if ((index \| limit \| bytes.length - limit) < 0) {	1000 if ((index \| limit \| bytes.length - limit) < 0) {

1016 throw new ArrayIndexOutOfBoundsException(	1001 throw new ArrayIndexOutOfBoundsException(

1017 String.format("Array length=%d, index=%d, limit=%d", bytes.length, i ndex, limit));	1002 String.format("Array length=%d, index=%d, limit=%d", bytes.length, i ndex, limit));

1018 }	1003 }

1019 long offset = ARRAY_BASE_OFFSET + index;	1004 long offset = getArrayBaseOffset() + index;

1020 final long offsetLimit = ARRAY_BASE_OFFSET + limit;	1005 final long offsetLimit = getArrayBaseOffset() + limit;

1021 if (state != COMPLETE) {	1006 if (state != COMPLETE) {

1022 // The previous decoding operation was incomplete (or malformed).	1007 // The previous decoding operation was incomplete (or malformed).

1023 // We look for a well-formed sequence consisting of bytes from	1008 // We look for a well-formed sequence consisting of bytes from

1024 // the previous decoding operation (stored in state) together	1009 // the previous decoding operation (stored in state) together

1025 // with bytes from the array slice.	1010 // with bytes from the array slice.

1026 //	1011 //

1027 // We expect such "straddler characters" to be rare.	1012 // We expect such "straddler characters" to be rare.

1028	1013

1029 if (offset >= offsetLimit) { // No bytes? No progress.	1014 if (offset >= offsetLimit) { // No bytes? No progress.

1030 return state;	1015 return state;

1031 }	1016 }

1032 int byte1 = (byte) state;	1017 int byte1 = (byte) state;

1033 // byte1 is never ASCII.	1018 // byte1 is never ASCII.

1034 if (byte1 < (byte) 0xE0) {	1019 if (byte1 < (byte) 0xE0) {

1035 // two-byte form	1020 // two-byte form

1036	1021

1037 // Simultaneously checks for illegal trailing-byte in	1022 // Simultaneously checks for illegal trailing-byte in

1038 // leading position and overlong 2-byte form.	1023 // leading position and overlong 2-byte form.

1039 if (byte1 < (byte) 0xC2	1024 if (byte1 < (byte) 0xC2

1040 // byte2 trailing-byte test	1025 // byte2 trailing-byte test

1041 \|\| UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {	1026 \|\| UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {

1042 return MALFORMED;	1027 return MALFORMED;

1043 }	1028 }

1044 } else if (byte1 < (byte) 0xF0) {	1029 } else if (byte1 < (byte) 0xF0) {

1045 // three-byte form	1030 // three-byte form

1046	1031

1047 // Get byte2 from saved state or array	1032 // Get byte2 from saved state or array

1048 int byte2 = (byte) ~(state >> 8);	1033 int byte2 = (byte) ~(state >> 8);

1049 if (byte2 == 0) {	1034 if (byte2 == 0) {

1050 byte2 = UNSAFE.getByte(bytes, offset++);	1035 byte2 = UnsafeUtil.getByte(bytes, offset++);

1051 if (offset >= offsetLimit) {	1036 if (offset >= offsetLimit) {

1052 return incompleteStateFor(byte1, byte2);	1037 return incompleteStateFor(byte1, byte2);

1053 }	1038 }

1054 }	1039 }

1055 if (byte2 > (byte) 0xBF	1040 if (byte2 > (byte) 0xBF

1056 // overlong? 5 most significant bits must not all be zero	1041 // overlong? 5 most significant bits must not all be zero

1057 \|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)	1042 \|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)

1058 // illegal surrogate codepoint?	1043 // illegal surrogate codepoint?

1059 \|\| (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)	1044 \|\| (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)

1060 // byte3 trailing-byte test	1045 // byte3 trailing-byte test

1061 \|\| UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {	1046 \|\| UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {

1062 return MALFORMED;	1047 return MALFORMED;

1063 }	1048 }

1064 } else {	1049 } else {

1065 // four-byte form	1050 // four-byte form

1066	1051

1067 // Get byte2 and byte3 from saved state or array	1052 // Get byte2 and byte3 from saved state or array

1068 int byte2 = (byte) ~(state >> 8);	1053 int byte2 = (byte) ~(state >> 8);

1069 int byte3 = 0;	1054 int byte3 = 0;

1070 if (byte2 == 0) {	1055 if (byte2 == 0) {

1071 byte2 = UNSAFE.getByte(bytes, offset++);	1056 byte2 = UnsafeUtil.getByte(bytes, offset++);

1072 if (offset >= offsetLimit) {	1057 if (offset >= offsetLimit) {

1073 return incompleteStateFor(byte1, byte2);	1058 return incompleteStateFor(byte1, byte2);

1074 }	1059 }

1075 } else {	1060 } else {

1076 byte3 = (byte) (state >> 16);	1061 byte3 = (byte) (state >> 16);

1077 }	1062 }

1078 if (byte3 == 0) {	1063 if (byte3 == 0) {

1079 byte3 = UNSAFE.getByte(bytes, offset++);	1064 byte3 = UnsafeUtil.getByte(bytes, offset++);

1080 if (offset >= offsetLimit) {	1065 if (offset >= offsetLimit) {

1081 return incompleteStateFor(byte1, byte2, byte3);	1066 return incompleteStateFor(byte1, byte2, byte3);

1082 }	1067 }

1083 }	1068 }

1084	1069

1085 // If we were called with state == MALFORMED, then byte1 is 0xFF,	1070 // If we were called with state == MALFORMED, then byte1 is 0xFF,

1086 // which never occurs in well-formed UTF-8, and so we will return	1071 // which never occurs in well-formed UTF-8, and so we will return

1087 // MALFORMED again below.	1072 // MALFORMED again below.

1088	1073

1089 if (byte2 > (byte) 0xBF	1074 if (byte2 > (byte) 0xBF

1090 // Check that 1 <= plane <= 16. Tricky optimized form of:	1075 // Check that 1 <= plane <= 16. Tricky optimized form of:

1091 // if (byte1 > (byte) 0xF4 \|\|	1076 // if (byte1 > (byte) 0xF4 \|\|

1092 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 \|\|	1077 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 \|\|

1093 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)	1078 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)

1094 \|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0	1079 \|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0

1095 // byte3 trailing-byte test	1080 // byte3 trailing-byte test

1096 \|\| byte3 > (byte) 0xBF	1081 \|\| byte3 > (byte) 0xBF

1097 // byte4 trailing-byte test	1082 // byte4 trailing-byte test

1098 \|\| UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {	1083 \|\| UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {

1099 return MALFORMED;	1084 return MALFORMED;

1100 }	1085 }

1101 }	1086 }

1102 }	1087 }

1103	1088

1104 return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset));	1089 return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset));

1105 }	1090 }

1106	1091

1107 @Override	1092 @Override

1108 int partialIsValidUtf8Direct(	1093 int partialIsValidUtf8Direct(

(...skipping 18 matching lines...) Expand all Loading...
1127	1112

1128 final int byte1 = (byte) state;	1113 final int byte1 = (byte) state;

1129 // byte1 is never ASCII.	1114 // byte1 is never ASCII.

1130 if (byte1 < (byte) 0xE0) {	1115 if (byte1 < (byte) 0xE0) {

1131 // two-byte form	1116 // two-byte form

1132	1117

1133 // Simultaneously checks for illegal trailing-byte in	1118 // Simultaneously checks for illegal trailing-byte in

1134 // leading position and overlong 2-byte form.	1119 // leading position and overlong 2-byte form.

1135 if (byte1 < (byte) 0xC2	1120 if (byte1 < (byte) 0xC2

1136 // byte2 trailing-byte test	1121 // byte2 trailing-byte test

1137 \|\| UNSAFE.getByte(address++) > (byte) 0xBF) {	1122 \|\| UnsafeUtil.getByte(address++) > (byte) 0xBF) {

1138 return MALFORMED;	1123 return MALFORMED;

1139 }	1124 }

1140 } else if (byte1 < (byte) 0xF0) {	1125 } else if (byte1 < (byte) 0xF0) {

1141 // three-byte form	1126 // three-byte form

1142	1127

1143 // Get byte2 from saved state or array	1128 // Get byte2 from saved state or array

1144 int byte2 = (byte) ~(state >> 8);	1129 int byte2 = (byte) ~(state >> 8);

1145 if (byte2 == 0) {	1130 if (byte2 == 0) {

1146 byte2 = UNSAFE.getByte(address++);	1131 byte2 = UnsafeUtil.getByte(address++);

1147 if (address >= addressLimit) {	1132 if (address >= addressLimit) {

1148 return incompleteStateFor(byte1, byte2);	1133 return incompleteStateFor(byte1, byte2);

1149 }	1134 }

1150 }	1135 }

1151 if (byte2 > (byte) 0xBF	1136 if (byte2 > (byte) 0xBF

1152 // overlong? 5 most significant bits must not all be zero	1137 // overlong? 5 most significant bits must not all be zero

1153 \|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)	1138 \|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)

1154 // illegal surrogate codepoint?	1139 // illegal surrogate codepoint?

1155 \|\| (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)	1140 \|\| (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)

1156 // byte3 trailing-byte test	1141 // byte3 trailing-byte test

1157 \|\| UNSAFE.getByte(address++) > (byte) 0xBF) {	1142 \|\| UnsafeUtil.getByte(address++) > (byte) 0xBF) {

1158 return MALFORMED;	1143 return MALFORMED;

1159 }	1144 }

1160 } else {	1145 } else {

1161 // four-byte form	1146 // four-byte form

1162	1147

1163 // Get byte2 and byte3 from saved state or array	1148 // Get byte2 and byte3 from saved state or array

1164 int byte2 = (byte) ~(state >> 8);	1149 int byte2 = (byte) ~(state >> 8);

1165 int byte3 = 0;	1150 int byte3 = 0;

1166 if (byte2 == 0) {	1151 if (byte2 == 0) {

1167 byte2 = UNSAFE.getByte(address++);	1152 byte2 = UnsafeUtil.getByte(address++);

1168 if (address >= addressLimit) {	1153 if (address >= addressLimit) {

1169 return incompleteStateFor(byte1, byte2);	1154 return incompleteStateFor(byte1, byte2);

1170 }	1155 }

1171 } else {	1156 } else {

1172 byte3 = (byte) (state >> 16);	1157 byte3 = (byte) (state >> 16);

1173 }	1158 }

1174 if (byte3 == 0) {	1159 if (byte3 == 0) {

1175 byte3 = UNSAFE.getByte(address++);	1160 byte3 = UnsafeUtil.getByte(address++);

1176 if (address >= addressLimit) {	1161 if (address >= addressLimit) {

1177 return incompleteStateFor(byte1, byte2, byte3);	1162 return incompleteStateFor(byte1, byte2, byte3);

1178 }	1163 }

1179 }	1164 }

1180	1165

1181 // If we were called with state == MALFORMED, then byte1 is 0xFF,	1166 // If we were called with state == MALFORMED, then byte1 is 0xFF,

1182 // which never occurs in well-formed UTF-8, and so we will return	1167 // which never occurs in well-formed UTF-8, and so we will return

1183 // MALFORMED again below.	1168 // MALFORMED again below.

1184	1169

1185 if (byte2 > (byte) 0xBF	1170 if (byte2 > (byte) 0xBF

1186 // Check that 1 <= plane <= 16. Tricky optimized form of:	1171 // Check that 1 <= plane <= 16. Tricky optimized form of:

1187 // if (byte1 > (byte) 0xF4 \|\|	1172 // if (byte1 > (byte) 0xF4 \|\|

1188 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 \|\|	1173 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 \|\|

1189 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)	1174 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)

1190 \|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0	1175 \|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0

1191 // byte3 trailing-byte test	1176 // byte3 trailing-byte test

1192 \|\| byte3 > (byte) 0xBF	1177 \|\| byte3 > (byte) 0xBF

1193 // byte4 trailing-byte test	1178 // byte4 trailing-byte test

1194 \|\| UNSAFE.getByte(address++) > (byte) 0xBF) {	1179 \|\| UnsafeUtil.getByte(address++) > (byte) 0xBF) {

1195 return MALFORMED;	1180 return MALFORMED;

1196 }	1181 }

1197 }	1182 }

1198 }	1183 }

1199	1184

1200 return partialIsValidUtf8(address, (int) (addressLimit - address));	1185 return partialIsValidUtf8(address, (int) (addressLimit - address));

1201 }	1186 }

1202	1187

1203 @Override	1188 @Override

1204 int encodeUtf8(final CharSequence in, final byte[] out, final int offset, fi nal int length) {	1189 int encodeUtf8(final CharSequence in, final byte[] out, final int offset, fi nal int length) {

1205 long outIx = ARRAY_BASE_OFFSET + offset;	1190 long outIx = getArrayBaseOffset() + offset;

1206 final long outLimit = outIx + length;	1191 final long outLimit = outIx + length;

1207 final int inLimit = in.length();	1192 final int inLimit = in.length();

1208 if (inLimit > length \|\| out.length - length < offset) {	1193 if (inLimit > length \|\| out.length - length < offset) {

1209 // Not even enough room for an ASCII-encoded string.	1194 // Not even enough room for an ASCII-encoded string.

1210 throw new ArrayIndexOutOfBoundsException(	1195 throw new ArrayIndexOutOfBoundsException(

1211 "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset + length));	1196 "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset + length));

1212 }	1197 }

1213	1198

1214 // Designed to take advantage of	1199 // Designed to take advantage of

1215 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination	1200 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination

1216 int inIx = 0;	1201 int inIx = 0;

1217 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {	1202 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {

1218 UNSAFE.putByte(out, outIx++, (byte) c);	1203 UnsafeUtil.putByte(out, outIx++, (byte) c);

1219 }	1204 }

1220 if (inIx == inLimit) {	1205 if (inIx == inLimit) {

1221 // We're done, it was ASCII encoded.	1206 // We're done, it was ASCII encoded.

1222 return (int) (outIx - ARRAY_BASE_OFFSET);	1207 return (int) (outIx - getArrayBaseOffset());

1223 }	1208 }

1224	1209

1225 for (char c; inIx < inLimit; ++inIx) {	1210 for (char c; inIx < inLimit; ++inIx) {

1226 c = in.charAt(inIx);	1211 c = in.charAt(inIx);

1227 if (c < 0x80 && outIx < outLimit) {	1212 if (c < 0x80 && outIx < outLimit) {

1228 UNSAFE.putByte(out, outIx++, (byte) c);	1213 UnsafeUtil.putByte(out, outIx++, (byte) c);

1229 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes	1214 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes

1230 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 6) \| (c >>> 6)));	1215 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 6) \| (c >>> 6)));

1231 UNSAFE.putByte(out, outIx++, (byte) (0x80 \| (0x3F & c)));	1216 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 \| (0x3F & c)));

1232 } else if ((c < MIN_SURROGATE \|\| MAX_SURROGATE < c) && outIx <= outLimit - 3L) {	1217 } else if ((c < MIN_SURROGATE \|\| MAX_SURROGATE < c) && outIx <= outLimit - 3L) {

1233 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes	1218 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes

1234 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 5) \| (c >>> 12)));	1219 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 5) \| (c >>> 12)));

1235 UNSAFE.putByte(out, outIx++, (byte) (0x80 \| (0x3F & (c >>> 6))));	1220 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 \| (0x3F & (c >>> 6))));

1236 UNSAFE.putByte(out, outIx++, (byte) (0x80 \| (0x3F & c)));	1221 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 \| (0x3F & c)));

1237 } else if (outIx <= outLimit - 4L) {	1222 } else if (outIx <= outLimit - 4L) {

1238 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8	1223 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8

1239 // bytes	1224 // bytes

1240 final char low;	1225 final char low;

1241 if (inIx + 1 == inLimit \|\| !isSurrogatePair(c, (low = in.charAt(++inIx )))) {	1226 if (inIx + 1 == inLimit \|\| !isSurrogatePair(c, (low = in.charAt(++inIx )))) {

1242 throw new UnpairedSurrogateException((inIx - 1), inLimit);	1227 throw new UnpairedSurrogateException((inIx - 1), inLimit);

1243 }	1228 }

1244 int codePoint = toCodePoint(c, low);	1229 int codePoint = toCodePoint(c, low);

1245 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 4) \| (codePoint >>> 18))) ;	1230 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 4) \| (codePoint >>> 1 8)));

1246 UNSAFE.putByte(out, outIx++, (byte) (0x80 \| (0x3F & (codePoint >>> 12) )));	1231 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 \| (0x3F & (codePoint >>> 12))));

1247 UNSAFE.putByte(out, outIx++, (byte) (0x80 \| (0x3F & (codePoint >>> 6)) ));	1232 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 \| (0x3F & (codePoint >>> 6))));

1248 UNSAFE.putByte(out, outIx++, (byte) (0x80 \| (0x3F & codePoint)));	1233 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 \| (0x3F & codePoint)));

1249 } else {	1234 } else {

1250 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)	1235 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)

1251 && (inIx + 1 == inLimit \|\| !isSurrogatePair(c, in.charAt(inIx + 1) ))) {	1236 && (inIx + 1 == inLimit \|\| !isSurrogatePair(c, in.charAt(inIx + 1) ))) {

1252 // We are surrogates and we're not a surrogate pair.	1237 // We are surrogates and we're not a surrogate pair.

1253 throw new UnpairedSurrogateException(inIx, inLimit);	1238 throw new UnpairedSurrogateException(inIx, inLimit);

1254 }	1239 }

1255 // Not enough space in the output buffer.	1240 // Not enough space in the output buffer.

1256 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);	1241 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);

1257 }	1242 }

1258 }	1243 }

1259	1244

1260 // All bytes have been encoded.	1245 // All bytes have been encoded.

1261 return (int) (outIx - ARRAY_BASE_OFFSET);	1246 return (int) (outIx - getArrayBaseOffset());

1262 }	1247 }

1263	1248

1264 @Override	1249 @Override

1265 void encodeUtf8Direct(CharSequence in, ByteBuffer out) {	1250 void encodeUtf8Direct(CharSequence in, ByteBuffer out) {

1266 final long address = addressOffset(out);	1251 final long address = addressOffset(out);

1267 long outIx = address + out.position();	1252 long outIx = address + out.position();

1268 final long outLimit = address + out.limit();	1253 final long outLimit = address + out.limit();

1269 final int inLimit = in.length();	1254 final int inLimit = in.length();

1270 if (inLimit > outLimit - outIx) {	1255 if (inLimit > outLimit - outIx) {

1271 // Not even enough room for an ASCII-encoded string.	1256 // Not even enough room for an ASCII-encoded string.

1272 throw new ArrayIndexOutOfBoundsException(	1257 throw new ArrayIndexOutOfBoundsException(

1273 "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limi t());	1258 "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limi t());

1274 }	1259 }

1275	1260

1276 // Designed to take advantage of	1261 // Designed to take advantage of

1277 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination	1262 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination

1278 int inIx = 0;	1263 int inIx = 0;

1279 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {	1264 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {

1280 UNSAFE.putByte(outIx++, (byte) c);	1265 UnsafeUtil.putByte(outIx++, (byte) c);

1281 }	1266 }

1282 if (inIx == inLimit) {	1267 if (inIx == inLimit) {

1283 // We're done, it was ASCII encoded.	1268 // We're done, it was ASCII encoded.

1284 out.position((int) (outIx - address));	1269 out.position((int) (outIx - address));

1285 return;	1270 return;

1286 }	1271 }

1287	1272

1288 for (char c; inIx < inLimit; ++inIx) {	1273 for (char c; inIx < inLimit; ++inIx) {

1289 c = in.charAt(inIx);	1274 c = in.charAt(inIx);

1290 if (c < 0x80 && outIx < outLimit) {	1275 if (c < 0x80 && outIx < outLimit) {

1291 UNSAFE.putByte(outIx++, (byte) c);	1276 UnsafeUtil.putByte(outIx++, (byte) c);

1292 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes	1277 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes

1293 UNSAFE.putByte(outIx++, (byte) ((0xF << 6) \| (c >>> 6)));	1278 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 6) \| (c >>> 6)));

1294 UNSAFE.putByte(outIx++, (byte) (0x80 \| (0x3F & c)));	1279 UnsafeUtil.putByte(outIx++, (byte) (0x80 \| (0x3F & c)));

1295 } else if ((c < MIN_SURROGATE \|\| MAX_SURROGATE < c) && outIx <= outLimit - 3L) {	1280 } else if ((c < MIN_SURROGATE \|\| MAX_SURROGATE < c) && outIx <= outLimit - 3L) {

1296 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes	1281 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes

1297 UNSAFE.putByte(outIx++, (byte) ((0xF << 5) \| (c >>> 12)));	1282 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 5) \| (c >>> 12)));

1298 UNSAFE.putByte(outIx++, (byte) (0x80 \| (0x3F & (c >>> 6))));	1283 UnsafeUtil.putByte(outIx++, (byte) (0x80 \| (0x3F & (c >>> 6))));

1299 UNSAFE.putByte(outIx++, (byte) (0x80 \| (0x3F & c)));	1284 UnsafeUtil.putByte(outIx++, (byte) (0x80 \| (0x3F & c)));

1300 } else if (outIx <= outLimit - 4L) {	1285 } else if (outIx <= outLimit - 4L) {

1301 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8	1286 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8

1302 // bytes	1287 // bytes

1303 final char low;	1288 final char low;

1304 if (inIx + 1 == inLimit \|\| !isSurrogatePair(c, (low = in.charAt(++inIx )))) {	1289 if (inIx + 1 == inLimit \|\| !isSurrogatePair(c, (low = in.charAt(++inIx )))) {

1305 throw new UnpairedSurrogateException((inIx - 1), inLimit);	1290 throw new UnpairedSurrogateException((inIx - 1), inLimit);

1306 }	1291 }

1307 int codePoint = toCodePoint(c, low);	1292 int codePoint = toCodePoint(c, low);

1308 UNSAFE.putByte(outIx++, (byte) ((0xF << 4) \| (codePoint >>> 18)));	1293 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 4) \| (codePoint >>> 18)));

1309 UNSAFE.putByte(outIx++, (byte) (0x80 \| (0x3F & (codePoint >>> 12))));	1294 UnsafeUtil.putByte(outIx++, (byte) (0x80 \| (0x3F & (codePoint >>> 12)) ));

1310 UNSAFE.putByte(outIx++, (byte) (0x80 \| (0x3F & (codePoint >>> 6))));	1295 UnsafeUtil.putByte(outIx++, (byte) (0x80 \| (0x3F & (codePoint >>> 6))) );

1311 UNSAFE.putByte(outIx++, (byte) (0x80 \| (0x3F & codePoint)));	1296 UnsafeUtil.putByte(outIx++, (byte) (0x80 \| (0x3F & codePoint)));

1312 } else {	1297 } else {

1313 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)	1298 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)

1314 && (inIx + 1 == inLimit \|\| !isSurrogatePair(c, in.charAt(inIx + 1) ))) {	1299 && (inIx + 1 == inLimit \|\| !isSurrogatePair(c, in.charAt(inIx + 1) ))) {

1315 // We are surrogates and we're not a surrogate pair.	1300 // We are surrogates and we're not a surrogate pair.

1316 throw new UnpairedSurrogateException(inIx, inLimit);	1301 throw new UnpairedSurrogateException(inIx, inLimit);

1317 }	1302 }

1318 // Not enough space in the output buffer.	1303 // Not enough space in the output buffer.

1319 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);	1304 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);

1320 }	1305 }

1321 }	1306 }

(...skipping 20 matching lines...) Expand all Loading...
1342 return 0;	1327 return 0;

1343 }	1328 }

1344	1329

1345 // Read bytes until 8-byte aligned so that we can read longs in the loop below.	1330 // Read bytes until 8-byte aligned so that we can read longs in the loop below.

1346 // Byte arrays are already either 8 or 16-byte aligned, so we just need to make sure that	1331 // Byte arrays are already either 8 or 16-byte aligned, so we just need to make sure that

1347 // the index (relative to the start of the array) is also 8-byte aligned. We do this by	1332 // the index (relative to the start of the array) is also 8-byte aligned. We do this by

1348 // ANDing the index with 7 to determine the number of bytes that need to be read before	1333 // ANDing the index with 7 to determine the number of bytes that need to be read before

1349 // we're 8-byte aligned.	1334 // we're 8-byte aligned.

1350 final int unaligned = (int) offset & 7;	1335 final int unaligned = (int) offset & 7;

1351 for (int j = unaligned; j > 0; j--) {	1336 for (int j = unaligned; j > 0; j--) {

1352 if (UNSAFE.getByte(bytes, offset++) < 0) {	1337 if (UnsafeUtil.getByte(bytes, offset++) < 0) {

1353 return unaligned - j;	1338 return unaligned - j;

1354 }	1339 }

1355 }	1340 }

1356	1341

1357 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).	1342 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).

1358 // To speed things up further, we're reading longs instead of bytes so we use a mask to	1343 // To speed things up further, we're reading longs instead of bytes so we use a mask to

1359 // determine if any byte in the current long is non-ASCII.	1344 // determine if any byte in the current long is non-ASCII.

1360 remaining -= unaligned;	1345 remaining -= unaligned;

1361 for (; remaining >= 8 && (UNSAFE.getLong(bytes, offset) & ASCII_MASK_LONG) == 0;	1346 for (; remaining >= 8 && (UnsafeUtil.getLong(bytes, offset) & ASCII_MASK_L ONG) == 0;

1362 offset += 8, remaining -= 8) {}	1347 offset += 8, remaining -= 8) {}

1363 return maxChars - remaining;	1348 return maxChars - remaining;

1364 }	1349 }

1365	1350

1366 /**	1351 /**

1367 * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} except that it uses the	1352 * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} except that it uses the

1368 * most efficient method available to the platform.	1353 * most efficient method available to the platform.

1369 */	1354 */

1370 private static int unsafeEstimateConsecutiveAscii(long address, final int ma xChars) {	1355 private static int unsafeEstimateConsecutiveAscii(long address, final int ma xChars) {

1371 int remaining = maxChars;	1356 int remaining = maxChars;

1372 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) {	1357 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) {

1373 // Don't bother with small strings.	1358 // Don't bother with small strings.

1374 return 0;	1359 return 0;

1375 }	1360 }

1376	1361

1377 // Read bytes until 8-byte aligned so that we can read longs in the loop below.	1362 // Read bytes until 8-byte aligned so that we can read longs in the loop below.

1378 // We do this by ANDing the address with 7 to determine the number of bytes that need to	1363 // We do this by ANDing the address with 7 to determine the number of bytes that need to

1379 // be read before we're 8-byte aligned.	1364 // be read before we're 8-byte aligned.

1380 final int unaligned = (int) address & 7;	1365 final int unaligned = (int) address & 7;

1381 for (int j = unaligned; j > 0; j--) {	1366 for (int j = unaligned; j > 0; j--) {

1382 if (UNSAFE.getByte(address++) < 0) {	1367 if (UnsafeUtil.getByte(address++) < 0) {

1383 return unaligned - j;	1368 return unaligned - j;

1384 }	1369 }

1385 }	1370 }

1386	1371

1387 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).	1372 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).

1388 // To speed things up further, we're reading longs instead of bytes so we use a mask to	1373 // To speed things up further, we're reading longs instead of bytes so we use a mask to

1389 // determine if any byte in the current long is non-ASCII.	1374 // determine if any byte in the current long is non-ASCII.

1390 remaining -= unaligned;	1375 remaining -= unaligned;

1391 for (; remaining >= 8 && (UNSAFE.getLong(address) & ASCII_MASK_LONG) == 0;	1376 for (; remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) = = 0;

1392 address += 8, remaining -= 8) {}	1377 address += 8, remaining -= 8) {}

1393 return maxChars - remaining;	1378 return maxChars - remaining;

1394 }	1379 }

1395	1380

1396 private static int partialIsValidUtf8(final byte[] bytes, long offset, int r emaining) {	1381 private static int partialIsValidUtf8(final byte[] bytes, long offset, int r emaining) {

1397 // Skip past ASCII characters as quickly as possible.	1382 // Skip past ASCII characters as quickly as possible.

1398 final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remainin g);	1383 final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remainin g);

1399 remaining -= skipped;	1384 remaining -= skipped;

1400 offset += skipped;	1385 offset += skipped;

1401	1386

1402 for (;;) {	1387 for (;;) {

1403 // Optimize for interior runs of ASCII bytes.	1388 // Optimize for interior runs of ASCII bytes.

1404 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?	1389 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?

1405 // Maybe after seeing a few in a row that are ASCII, go back to fast mode?	1390 // Maybe after seeing a few in a row that are ASCII, go back to fast mode?

1406 int byte1 = 0;	1391 int byte1 = 0;

1407 for (; remaining > 0 && (byte1 = UNSAFE.getByte(bytes, offset++)) >= 0; --remaining) {	1392 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >= 0; --remaining) {

1408 }	1393 }

1409 if (remaining == 0) {	1394 if (remaining == 0) {

1410 return COMPLETE;	1395 return COMPLETE;

1411 }	1396 }

1412 remaining--;	1397 remaining--;

1413	1398

1414 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms.	1399 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms.

1415 if (byte1 < (byte) 0xE0) {	1400 if (byte1 < (byte) 0xE0) {

1416 // Two-byte form (110xxxxx 10xxxxxx)	1401 // Two-byte form (110xxxxx 10xxxxxx)

1417 if (remaining == 0) {	1402 if (remaining == 0) {

1418 // Incomplete sequence	1403 // Incomplete sequence

1419 return byte1;	1404 return byte1;

1420 }	1405 }

1421 remaining--;	1406 remaining--;

1422	1407

1423 // Simultaneously checks for illegal trailing-byte in	1408 // Simultaneously checks for illegal trailing-byte in

1424 // leading position and overlong 2-byte form.	1409 // leading position and overlong 2-byte form.

1425 if (byte1 < (byte) 0xC2	1410 if (byte1 < (byte) 0xC2

1426 \|\| UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {	1411 \|\| UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {

1427 return MALFORMED;	1412 return MALFORMED;

1428 }	1413 }

1429 } else if (byte1 < (byte) 0xF0) {	1414 } else if (byte1 < (byte) 0xF0) {

1430 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)	1415 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)

1431 if (remaining < 2) {	1416 if (remaining < 2) {

1432 // Incomplete sequence	1417 // Incomplete sequence

1433 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);	1418 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);

1434 }	1419 }

1435 remaining -= 2;	1420 remaining -= 2;

1436	1421

1437 final int byte2;	1422 final int byte2;

1438 if ((byte2 = UNSAFE.getByte(bytes, offset++)) > (byte) 0xBF	1423 if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF

1439 // overlong? 5 most significant bits must not all be zero	1424 // overlong? 5 most significant bits must not all be zero

1440 \|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)	1425 \|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)

1441 // check for illegal surrogate codepoints	1426 // check for illegal surrogate codepoints

1442 \|\| (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)	1427 \|\| (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)

1443 // byte3 trailing-byte test	1428 // byte3 trailing-byte test

1444 \|\| UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {	1429 \|\| UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {

1445 return MALFORMED;	1430 return MALFORMED;

1446 }	1431 }

1447 } else {	1432 } else {

1448 // Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)	1433 // Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)

1449 if (remaining < 3) {	1434 if (remaining < 3) {

1450 // Incomplete sequence	1435 // Incomplete sequence

1451 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);	1436 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);

1452 }	1437 }

1453 remaining -= 3;	1438 remaining -= 3;

1454	1439

1455 final int byte2;	1440 final int byte2;

1456 if ((byte2 = UNSAFE.getByte(bytes, offset++)) > (byte) 0xBF	1441 if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF

1457 // Check that 1 <= plane <= 16. Tricky optimized form of:	1442 // Check that 1 <= plane <= 16. Tricky optimized form of:

1458 // if (byte1 > (byte) 0xF4 \|\|	1443 // if (byte1 > (byte) 0xF4 \|\|

1459 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 \|\|	1444 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 \|\|

1460 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)	1445 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)

1461 \|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0	1446 \|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0

1462 // byte3 trailing-byte test	1447 // byte3 trailing-byte test

1463 \|\| UNSAFE.getByte(bytes, offset++) > (byte) 0xBF	1448 \|\| UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF

1464 // byte4 trailing-byte test	1449 // byte4 trailing-byte test

1465 \|\| UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {	1450 \|\| UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {

1466 return MALFORMED;	1451 return MALFORMED;

1467 }	1452 }

1468 }	1453 }

1469 }	1454 }

1470 }	1455 }

1471	1456

1472 private static int partialIsValidUtf8(long address, int remaining) {	1457 private static int partialIsValidUtf8(long address, int remaining) {

1473 // Skip past ASCII characters as quickly as possible.	1458 // Skip past ASCII characters as quickly as possible.

1474 final int skipped = unsafeEstimateConsecutiveAscii(address, remaining);	1459 final int skipped = unsafeEstimateConsecutiveAscii(address, remaining);

1475 address += skipped;	1460 address += skipped;

1476 remaining -= skipped;	1461 remaining -= skipped;

1477	1462

1478 for (;;) {	1463 for (;;) {

1479 // Optimize for interior runs of ASCII bytes.	1464 // Optimize for interior runs of ASCII bytes.

1480 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?	1465 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?

1481 // Maybe after seeing a few in a row that are ASCII, go back to fast mode?	1466 // Maybe after seeing a few in a row that are ASCII, go back to fast mode?

1482 int byte1 = 0;	1467 int byte1 = 0;

1483 for (; remaining > 0 && (byte1 = UNSAFE.getByte(address++)) >= 0; --rema ining) {	1468 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; -- remaining) {

1484 }	1469 }

1485 if (remaining == 0) {	1470 if (remaining == 0) {

1486 return COMPLETE;	1471 return COMPLETE;

1487 }	1472 }

1488 remaining--;	1473 remaining--;

1489	1474

1490 if (byte1 < (byte) 0xE0) {	1475 if (byte1 < (byte) 0xE0) {

1491 // Two-byte form	1476 // Two-byte form

1492	1477

1493 if (remaining == 0) {	1478 if (remaining == 0) {

1494 // Incomplete sequence	1479 // Incomplete sequence

1495 return byte1;	1480 return byte1;

1496 }	1481 }

1497 remaining--;	1482 remaining--;

1498	1483

1499 // Simultaneously checks for illegal trailing-byte in	1484 // Simultaneously checks for illegal trailing-byte in

1500 // leading position and overlong 2-byte form.	1485 // leading position and overlong 2-byte form.

1501 if (byte1 < (byte) 0xC2 \|\| UNSAFE.getByte(address++) > (byte) 0xBF) {	1486 if (byte1 < (byte) 0xC2 \|\| UnsafeUtil.getByte(address++) > (byte) 0xBF ) {

1502 return MALFORMED;	1487 return MALFORMED;

1503 }	1488 }

1504 } else if (byte1 < (byte) 0xF0) {	1489 } else if (byte1 < (byte) 0xF0) {

1505 // Three-byte form	1490 // Three-byte form

1506	1491

1507 if (remaining < 2) {	1492 if (remaining < 2) {

1508 // Incomplete sequence	1493 // Incomplete sequence

1509 return unsafeIncompleteStateFor(address, byte1, remaining);	1494 return unsafeIncompleteStateFor(address, byte1, remaining);

1510 }	1495 }

1511 remaining -= 2;	1496 remaining -= 2;

1512	1497

1513 final byte byte2 = UNSAFE.getByte(address++);	1498 final byte byte2 = UnsafeUtil.getByte(address++);

1514 if (byte2 > (byte) 0xBF	1499 if (byte2 > (byte) 0xBF

1515 // overlong? 5 most significant bits must not all be zero	1500 // overlong? 5 most significant bits must not all be zero

1516 \|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)	1501 \|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)

1517 // check for illegal surrogate codepoints	1502 // check for illegal surrogate codepoints

1518 \|\| (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)	1503 \|\| (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)

1519 // byte3 trailing-byte test	1504 // byte3 trailing-byte test

1520 \|\| UNSAFE.getByte(address++) > (byte) 0xBF) {	1505 \|\| UnsafeUtil.getByte(address++) > (byte) 0xBF) {

1521 return MALFORMED;	1506 return MALFORMED;

1522 }	1507 }

1523 } else {	1508 } else {

1524 // Four-byte form	1509 // Four-byte form

1525	1510

1526 if (remaining < 3) {	1511 if (remaining < 3) {

1527 // Incomplete sequence	1512 // Incomplete sequence

1528 return unsafeIncompleteStateFor(address, byte1, remaining);	1513 return unsafeIncompleteStateFor(address, byte1, remaining);

1529 }	1514 }

1530 remaining -= 3;	1515 remaining -= 3;

1531	1516

1532 final byte byte2 = UNSAFE.getByte(address++);	1517 final byte byte2 = UnsafeUtil.getByte(address++);

1533 if (byte2 > (byte) 0xBF	1518 if (byte2 > (byte) 0xBF

1534 // Check that 1 <= plane <= 16. Tricky optimized form of:	1519 // Check that 1 <= plane <= 16. Tricky optimized form of:

1535 // if (byte1 > (byte) 0xF4 \|\|	1520 // if (byte1 > (byte) 0xF4 \|\|

1536 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 \|\|	1521 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 \|\|

1537 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)	1522 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)

1538 \|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0	1523 \|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0

1539 // byte3 trailing-byte test	1524 // byte3 trailing-byte test

1540 \|\| UNSAFE.getByte(address++) > (byte) 0xBF	1525 \|\| UnsafeUtil.getByte(address++) > (byte) 0xBF

1541 // byte4 trailing-byte test	1526 // byte4 trailing-byte test

1542 \|\| UNSAFE.getByte(address++) > (byte) 0xBF) {	1527 \|\| UnsafeUtil.getByte(address++) > (byte) 0xBF) {

1543 return MALFORMED;	1528 return MALFORMED;

1544 }	1529 }

1545 }	1530 }

1546 }	1531 }

1547 }	1532 }

1548	1533

1549 private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long of fset,	1534 private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long of fset,

1550 int remaining) {	1535 int remaining) {

1551 switch (remaining) {	1536 switch (remaining) {

1552 case 0: {	1537 case 0: {

1553 return incompleteStateFor(byte1);	1538 return incompleteStateFor(byte1);

1554 }	1539 }

1555 case 1: {	1540 case 1: {

1556 return incompleteStateFor(byte1, UNSAFE.getByte(bytes, offset));	1541 return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset));

1557 }	1542 }

1558 case 2: {	1543 case 2: {

1559 return incompleteStateFor(byte1, UNSAFE.getByte(bytes, offset),	1544 return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset),

1560 UNSAFE.getByte(bytes, offset + 1));	1545 UnsafeUtil.getByte(bytes, offset + 1));

1561 }	1546 }

1562 default: {	1547 default: {

1563 throw new AssertionError();	1548 throw new AssertionError();

1564 }	1549 }

1565 }	1550 }

1566 }	1551 }

1567	1552

1568 private static int unsafeIncompleteStateFor(long address, final int byte1, i nt remaining) {	1553 private static int unsafeIncompleteStateFor(long address, final int byte1, i nt remaining) {

1569 switch (remaining) {	1554 switch (remaining) {

1570 case 0: {	1555 case 0: {

1571 return incompleteStateFor(byte1);	1556 return incompleteStateFor(byte1);

1572 }	1557 }

1573 case 1: {	1558 case 1: {

1574 return incompleteStateFor(byte1, UNSAFE.getByte(address));	1559 return incompleteStateFor(byte1, UnsafeUtil.getByte(address));

1575 }	1560 }

1576 case 2: {	1561 case 2: {

1577 return incompleteStateFor(byte1, UNSAFE.getByte(address), UNSAFE.getBy te(address + 1));	1562 return incompleteStateFor(byte1, UnsafeUtil.getByte(address),

	1563 UnsafeUtil.getByte(address + 1));

1578 }	1564 }

1579 default: {	1565 default: {

1580 throw new AssertionError();	1566 throw new AssertionError();

1581 }	1567 }

1582 }	1568 }

1583 }	1569 }

1584

1585 /**

1586 * Gets the field with the given name within the class, or {@code null} if not found. If

1587 * found, the field is made accessible.

1588 */

1589 private static Field field(Class<?> clazz, String fieldName) {

1590 Field field;

1591 try {

1592 field = clazz.getDeclaredField(fieldName);

1593 field.setAccessible(true);

1594 } catch (Throwable t) {

1595 // Failed to access the fields.

1596 field = null;

1597 }

1598 logger.log(Level.FINEST, "{0}.{1}: {2}",

1599 new Object[] {clazz.getName(), fieldName, (field != null ? "available" : "unavailable")});

1600 return field;

1601 }

1602

1603 /**

1604 * Returns the offset of the provided field, or {@code -1} if {@code sun.misc.Unsafe} is not

1605 * available.

1606 */

1607 private static long fieldOffset(Field field) {

1608 return field == null \|\| UNSAFE == null ? -1 : UNSAFE.objectFieldOffset(fie ld);

1609 }

1610

1611 /**

1612 * Get the base offset for byte arrays, or {@code -1} if {@code sun.misc.Unsafe} is not

1613 * available.

1614 */

1615 private static <T> int byteArrayBaseOffset() {

1616 return UNSAFE == null ? -1 : UNSAFE.arrayBaseOffset(byte[].class);

1617 }

1618

1619 /**

1620 * Gets the offset of the {@code address} field of the given direct {@link ByteBuffer}.

1621 */

1622 private static long addressOffset(ByteBuffer buffer) {

1623 return UNSAFE.getLong(buffer, BUFFER_ADDRESS_OFFSET);

1624 }

1625

1626 /**

1627 * Gets the {@code sun.misc.Unsafe} instance, or {@code null} if not available on this

1628 * platform.

1629 */

1630 private static sun.misc.Unsafe getUnsafe() {

1631 sun.misc.Unsafe unsafe = null;

1632 try {

1633 unsafe = AccessController.doPrivileged(new PrivilegedExceptionAction<sun .misc.Unsafe>() {

1634 @Override

1635 public sun.misc.Unsafe run() throws Exception {

1636 Class<sun.misc.Unsafe> k = sun.misc.Unsafe.class;

1637

1638 // Check that this platform supports all of the required unsafe methods.

1639 checkRequiredMethods(k);

1640

1641 for (Field f : k.getDeclaredFields()) {

1642 f.setAccessible(true);

1643 Object x = f.get(null);

1644 if (k.isInstance(x)) {

1645 return k.cast(x);

1646 }

1647 }

1648 // The sun.misc.Unsafe field does not exist.

1649 return null;

1650 }

1651 });

1652 } catch (Throwable e) {

1653 // Catching Throwable here due to the fact that Google AppEngine raises NoClassDefFoundError

1654 // for Unsafe.

1655 }

1656

1657 logger.log(Level.FINEST, "sun.misc.Unsafe: {}",

1658 unsafe != null ? "available" : "unavailable");

1659 return unsafe;

1660 }

1661

1662 /**

1663 * Verifies that all required methods of {@code sun.misc.Unsafe} are available on this platform.

1664 */

1665 private static void checkRequiredMethods(Class<sun.misc.Unsafe> clazz)

1666 throws NoSuchMethodException, SecurityException {

1667 // Needed for Unsafe byte[] access

1668 clazz.getMethod("arrayBaseOffset", Class.class);

1669 clazz.getMethod("getByte", Object.class, long.class);

1670 clazz.getMethod("putByte", Object.class, long.class, byte.class);

1671 clazz.getMethod("getLong", Object.class, long.class);

1672

1673 // Needed for Unsafe Direct ByteBuffer access

1674 clazz.getMethod("objectFieldOffset", Field.class);

1675 clazz.getMethod("getByte", long.class);

1676 clazz.getMethod("getLong", Object.class, long.class);

1677 clazz.getMethod("putByte", long.class, byte.class);

1678 clazz.getMethod("getLong", long.class);

1679 }

1680 }	1570 }

1681	1571

1682 private Utf8() {}	1572 private Utf8() {}

1683 }	1573 }

OLD	NEW