third_party/protobuf/java/core/src/main/java/com/google/protobuf/Utf8.java - Issue 2600753002: Reverts third_party/protobuf: Update to HEAD (f52e188fe4)

Side by Side Diff: third_party/protobuf/java/core/src/main/java/com/google/protobuf/Utf8.java

Issue 2600753002: Reverts third_party/protobuf: Update to HEAD (f52e188fe4) (Closed)

Patch Set: Created 3 years, 12 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « third_party/protobuf/java/core/src/main/java/com/google/protobuf/UnsafeUtil.java ('k') | third_party/protobuf/java/core/src/main/java/com/google/protobuf/WireFormat.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Protocol Buffers - Google's data interchange format	1 // Protocol Buffers - Google's data interchange format

2 // Copyright 2008 Google Inc. All rights reserved.	2 // Copyright 2008 Google Inc. All rights reserved.

3 // https://developers.google.com/protocol-buffers/	3 // https://developers.google.com/protocol-buffers/

4 //	4 //

5 // Redistribution and use in source and binary forms, with or without	5 // Redistribution and use in source and binary forms, with or without

6 // modification, are permitted provided that the following conditions are	6 // modification, are permitted provided that the following conditions are

7 // met:	7 // met:

8 //	8 //

9 // * Redistributions of source code must retain the above copyright	9 // * Redistributions of source code must retain the above copyright

10 // notice, this list of conditions and the following disclaimer.	10 // notice, this list of conditions and the following disclaimer.

(...skipping 12 matching lines...) Expand all Loading...
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,	23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT	24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,	25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY	26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT	27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE	28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.	29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

30	30

31 package com.google.protobuf;	31 package com.google.protobuf;

32	32

33 import static com.google.protobuf.UnsafeUtil.addressOffset;

34 import static com.google.protobuf.UnsafeUtil.getArrayBaseOffset;

35 import static com.google.protobuf.UnsafeUtil.hasUnsafeArrayOperations;

36 import static com.google.protobuf.UnsafeUtil.hasUnsafeByteBufferOperations;

37 import static java.lang.Character.MAX_SURROGATE;	33 import static java.lang.Character.MAX_SURROGATE;

38 import static java.lang.Character.MIN_SURROGATE;	34 import static java.lang.Character.MIN_SURROGATE;

39 import static java.lang.Character.isSurrogatePair;	35 import static java.lang.Character.isSurrogatePair;

40 import static java.lang.Character.toCodePoint;	36 import static java.lang.Character.toCodePoint;

41	37

	38 import java.lang.reflect.Field;

	39 import java.nio.Buffer;

42 import java.nio.ByteBuffer;	40 import java.nio.ByteBuffer;

	41 import java.security.AccessController;

	42 import java.security.PrivilegedExceptionAction;

	43 import java.util.logging.Level;

	44 import java.util.logging.Logger;

43	45

44 /**	46 /**

45 * A set of low-level, high-performance static utility methods related	47 * A set of low-level, high-performance static utility methods related

46 * to the UTF-8 character encoding. This class has no dependencies	48 * to the UTF-8 character encoding. This class has no dependencies

47 * outside of the core JDK libraries.	49 * outside of the core JDK libraries.

48 *	50 *

49 * <p>There are several variants of UTF-8. The one implemented by	51 * <p>There are several variants of UTF-8. The one implemented by

50 * this class is the restricted definition of UTF-8 introduced in	52 * this class is the restricted definition of UTF-8 introduced in

51 * Unicode 3.1, which mandates the rejection of "overlong" byte	53 * Unicode 3.1, which mandates the rejection of "overlong" byte

52 * sequences as well as rejection of 3-byte surrogate codepoint byte	54 * sequences as well as rejection of 3-byte surrogate codepoint byte

(...skipping 17 matching lines...) Expand all Loading...
70 * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is	72 * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is

71 * well-formed in the absence of additional input, or if the byte sequence	73 * well-formed in the absence of additional input, or if the byte sequence

72 * apparently terminated in the middle of a character, an opaque integer	74 * apparently terminated in the middle of a character, an opaque integer

73 * "state" value containing enough information to decode the character when	75 * "state" value containing enough information to decode the character when

74 * passed to a subsequent invocation of a partial decoding method.	76 * passed to a subsequent invocation of a partial decoding method.

75 *	77 *

76 * @author martinrb@google.com (Martin Buchholz)	78 * @author martinrb@google.com (Martin Buchholz)

77 */	79 */

78 // TODO(nathanmittler): Copy changes in this class back to Guava	80 // TODO(nathanmittler): Copy changes in this class back to Guava

79 final class Utf8 {	81 final class Utf8 {

	82 private static final Logger logger = Logger.getLogger(Utf8.class.getName());

80	83

81 /**	84 /**

82 * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations	85 * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations

83 * depending on what is available on the platform. The processor is the platform-optimized	86 * depending on what is available on the platform. The processor is the platform-optimized

84 * delegate for which all methods are delegated directly to.	87 * delegate for which all methods are delegated directly to.

85 */	88 */

86 private static final Processor processor =	89 private static final Processor processor =

87 UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor( );	90 UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor( );

88	91

89 /**	92 /**

(...skipping 137 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
227 default:	230 default:

228 throw new AssertionError();	231 throw new AssertionError();

229 }	232 }

230 }	233 }

231	234

232 // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw	235 // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw

233 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can	236 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can

234 // fallback to more lenient behavior.	237 // fallback to more lenient behavior.

235	238

236 static class UnpairedSurrogateException extends IllegalArgumentException {	239 static class UnpairedSurrogateException extends IllegalArgumentException {

237 UnpairedSurrogateException(int index, int length) {	240 private UnpairedSurrogateException(int index, int length) {

238 super("Unpaired surrogate at index " + index + " of " + length);	241 super("Unpaired surrogate at index " + index + " of " + length);

239 }	242 }

240 }	243 }

241	244

242 /**	245 /**

243 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,	246 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,

244 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in	247 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in

245 * both time and space.	248 * both time and space.

246 *	249 *

247 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired	250 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired

(...skipping 733 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
981 }	984 }

982 }	985 }

983 }	986 }

984 }	987 }

985 }	988 }

986	989

987 /**	990 /**

988 * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance.	991 * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance.

989 */	992 */

990 static final class UnsafeProcessor extends Processor {	993 static final class UnsafeProcessor extends Processor {

	994 private static final sun.misc.Unsafe UNSAFE = getUnsafe();

	995 private static final long BUFFER_ADDRESS_OFFSET =

	996 fieldOffset(field(Buffer.class, "address"));

	997 private static final int ARRAY_BASE_OFFSET = byteArrayBaseOffset();

	998

	999 /**

	1000 * We only use Unsafe operations if we have access to direct {@link ByteBuffer}'s address

	1001 * and the array base offset is a multiple of 8 (needed by Unsafe.getLong()).

	1002 */

	1003 private static final boolean AVAILABLE =

	1004 BUFFER_ADDRESS_OFFSET != -1 && ARRAY_BASE_OFFSET % 8 == 0;

	1005

991 /**	1006 /**

992 * Indicates whether or not all required unsafe operations are supported on this platform.	1007 * Indicates whether or not all required unsafe operations are supported on this platform.

993 */	1008 */

994 static boolean isAvailable() {	1009 static boolean isAvailable() {

995 return hasUnsafeArrayOperations() && hasUnsafeByteBufferOperations();	1010 return AVAILABLE;

996 }	1011 }

997	1012

998 @Override	1013 @Override

999 int partialIsValidUtf8(int state, byte[] bytes, final int index, final int l imit) {	1014 int partialIsValidUtf8(int state, byte[] bytes, final int index, final int l imit) {

1000 if ((index \| limit \| bytes.length - limit) < 0) {	1015 if ((index \| limit \| bytes.length - limit) < 0) {

1001 throw new ArrayIndexOutOfBoundsException(	1016 throw new ArrayIndexOutOfBoundsException(

1002 String.format("Array length=%d, index=%d, limit=%d", bytes.length, i ndex, limit));	1017 String.format("Array length=%d, index=%d, limit=%d", bytes.length, i ndex, limit));

1003 }	1018 }

1004 long offset = getArrayBaseOffset() + index;	1019 long offset = ARRAY_BASE_OFFSET + index;

1005 final long offsetLimit = getArrayBaseOffset() + limit;	1020 final long offsetLimit = ARRAY_BASE_OFFSET + limit;

1006 if (state != COMPLETE) {	1021 if (state != COMPLETE) {

1007 // The previous decoding operation was incomplete (or malformed).	1022 // The previous decoding operation was incomplete (or malformed).

1008 // We look for a well-formed sequence consisting of bytes from	1023 // We look for a well-formed sequence consisting of bytes from

1009 // the previous decoding operation (stored in state) together	1024 // the previous decoding operation (stored in state) together

1010 // with bytes from the array slice.	1025 // with bytes from the array slice.

1011 //	1026 //

1012 // We expect such "straddler characters" to be rare.	1027 // We expect such "straddler characters" to be rare.

1013	1028

1014 if (offset >= offsetLimit) { // No bytes? No progress.	1029 if (offset >= offsetLimit) { // No bytes? No progress.

1015 return state;	1030 return state;

1016 }	1031 }

1017 int byte1 = (byte) state;	1032 int byte1 = (byte) state;

1018 // byte1 is never ASCII.	1033 // byte1 is never ASCII.

1019 if (byte1 < (byte) 0xE0) {	1034 if (byte1 < (byte) 0xE0) {

1020 // two-byte form	1035 // two-byte form

1021	1036

1022 // Simultaneously checks for illegal trailing-byte in	1037 // Simultaneously checks for illegal trailing-byte in

1023 // leading position and overlong 2-byte form.	1038 // leading position and overlong 2-byte form.

1024 if (byte1 < (byte) 0xC2	1039 if (byte1 < (byte) 0xC2

1025 // byte2 trailing-byte test	1040 // byte2 trailing-byte test

1026 \|\| UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {	1041 \|\| UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {

1027 return MALFORMED;	1042 return MALFORMED;

1028 }	1043 }

1029 } else if (byte1 < (byte) 0xF0) {	1044 } else if (byte1 < (byte) 0xF0) {

1030 // three-byte form	1045 // three-byte form

1031	1046

1032 // Get byte2 from saved state or array	1047 // Get byte2 from saved state or array

1033 int byte2 = (byte) ~(state >> 8);	1048 int byte2 = (byte) ~(state >> 8);

1034 if (byte2 == 0) {	1049 if (byte2 == 0) {

1035 byte2 = UnsafeUtil.getByte(bytes, offset++);	1050 byte2 = UNSAFE.getByte(bytes, offset++);

1036 if (offset >= offsetLimit) {	1051 if (offset >= offsetLimit) {

1037 return incompleteStateFor(byte1, byte2);	1052 return incompleteStateFor(byte1, byte2);

1038 }	1053 }

1039 }	1054 }

1040 if (byte2 > (byte) 0xBF	1055 if (byte2 > (byte) 0xBF

1041 // overlong? 5 most significant bits must not all be zero	1056 // overlong? 5 most significant bits must not all be zero

1042 \|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)	1057 \|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)

1043 // illegal surrogate codepoint?	1058 // illegal surrogate codepoint?

1044 \|\| (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)	1059 \|\| (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)

1045 // byte3 trailing-byte test	1060 // byte3 trailing-byte test

1046 \|\| UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {	1061 \|\| UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {

1047 return MALFORMED;	1062 return MALFORMED;

1048 }	1063 }

1049 } else {	1064 } else {

1050 // four-byte form	1065 // four-byte form

1051	1066

1052 // Get byte2 and byte3 from saved state or array	1067 // Get byte2 and byte3 from saved state or array

1053 int byte2 = (byte) ~(state >> 8);	1068 int byte2 = (byte) ~(state >> 8);

1054 int byte3 = 0;	1069 int byte3 = 0;

1055 if (byte2 == 0) {	1070 if (byte2 == 0) {

1056 byte2 = UnsafeUtil.getByte(bytes, offset++);	1071 byte2 = UNSAFE.getByte(bytes, offset++);

1057 if (offset >= offsetLimit) {	1072 if (offset >= offsetLimit) {

1058 return incompleteStateFor(byte1, byte2);	1073 return incompleteStateFor(byte1, byte2);

1059 }	1074 }

1060 } else {	1075 } else {

1061 byte3 = (byte) (state >> 16);	1076 byte3 = (byte) (state >> 16);

1062 }	1077 }

1063 if (byte3 == 0) {	1078 if (byte3 == 0) {

1064 byte3 = UnsafeUtil.getByte(bytes, offset++);	1079 byte3 = UNSAFE.getByte(bytes, offset++);

1065 if (offset >= offsetLimit) {	1080 if (offset >= offsetLimit) {

1066 return incompleteStateFor(byte1, byte2, byte3);	1081 return incompleteStateFor(byte1, byte2, byte3);

1067 }	1082 }

1068 }	1083 }

1069	1084

1070 // If we were called with state == MALFORMED, then byte1 is 0xFF,	1085 // If we were called with state == MALFORMED, then byte1 is 0xFF,

1071 // which never occurs in well-formed UTF-8, and so we will return	1086 // which never occurs in well-formed UTF-8, and so we will return

1072 // MALFORMED again below.	1087 // MALFORMED again below.

1073	1088

1074 if (byte2 > (byte) 0xBF	1089 if (byte2 > (byte) 0xBF

1075 // Check that 1 <= plane <= 16. Tricky optimized form of:	1090 // Check that 1 <= plane <= 16. Tricky optimized form of:

1076 // if (byte1 > (byte) 0xF4 \|\|	1091 // if (byte1 > (byte) 0xF4 \|\|

1077 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 \|\|	1092 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 \|\|

1078 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)	1093 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)

1079 \|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0	1094 \|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0

1080 // byte3 trailing-byte test	1095 // byte3 trailing-byte test

1081 \|\| byte3 > (byte) 0xBF	1096 \|\| byte3 > (byte) 0xBF

1082 // byte4 trailing-byte test	1097 // byte4 trailing-byte test

1083 \|\| UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {	1098 \|\| UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {

1084 return MALFORMED;	1099 return MALFORMED;

1085 }	1100 }

1086 }	1101 }

1087 }	1102 }

1088	1103

1089 return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset));	1104 return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset));

1090 }	1105 }

1091	1106

1092 @Override	1107 @Override

1093 int partialIsValidUtf8Direct(	1108 int partialIsValidUtf8Direct(

(...skipping 18 matching lines...) Expand all Loading...
1112	1127

1113 final int byte1 = (byte) state;	1128 final int byte1 = (byte) state;

1114 // byte1 is never ASCII.	1129 // byte1 is never ASCII.

1115 if (byte1 < (byte) 0xE0) {	1130 if (byte1 < (byte) 0xE0) {

1116 // two-byte form	1131 // two-byte form

1117	1132

1118 // Simultaneously checks for illegal trailing-byte in	1133 // Simultaneously checks for illegal trailing-byte in

1119 // leading position and overlong 2-byte form.	1134 // leading position and overlong 2-byte form.

1120 if (byte1 < (byte) 0xC2	1135 if (byte1 < (byte) 0xC2

1121 // byte2 trailing-byte test	1136 // byte2 trailing-byte test

1122 \|\| UnsafeUtil.getByte(address++) > (byte) 0xBF) {	1137 \|\| UNSAFE.getByte(address++) > (byte) 0xBF) {

1123 return MALFORMED;	1138 return MALFORMED;

1124 }	1139 }

1125 } else if (byte1 < (byte) 0xF0) {	1140 } else if (byte1 < (byte) 0xF0) {

1126 // three-byte form	1141 // three-byte form

1127	1142

1128 // Get byte2 from saved state or array	1143 // Get byte2 from saved state or array

1129 int byte2 = (byte) ~(state >> 8);	1144 int byte2 = (byte) ~(state >> 8);

1130 if (byte2 == 0) {	1145 if (byte2 == 0) {

1131 byte2 = UnsafeUtil.getByte(address++);	1146 byte2 = UNSAFE.getByte(address++);

1132 if (address >= addressLimit) {	1147 if (address >= addressLimit) {

1133 return incompleteStateFor(byte1, byte2);	1148 return incompleteStateFor(byte1, byte2);

1134 }	1149 }

1135 }	1150 }

1136 if (byte2 > (byte) 0xBF	1151 if (byte2 > (byte) 0xBF

1137 // overlong? 5 most significant bits must not all be zero	1152 // overlong? 5 most significant bits must not all be zero

1138 \|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)	1153 \|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)

1139 // illegal surrogate codepoint?	1154 // illegal surrogate codepoint?

1140 \|\| (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)	1155 \|\| (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)

1141 // byte3 trailing-byte test	1156 // byte3 trailing-byte test

1142 \|\| UnsafeUtil.getByte(address++) > (byte) 0xBF) {	1157 \|\| UNSAFE.getByte(address++) > (byte) 0xBF) {

1143 return MALFORMED;	1158 return MALFORMED;

1144 }	1159 }

1145 } else {	1160 } else {

1146 // four-byte form	1161 // four-byte form

1147	1162

1148 // Get byte2 and byte3 from saved state or array	1163 // Get byte2 and byte3 from saved state or array

1149 int byte2 = (byte) ~(state >> 8);	1164 int byte2 = (byte) ~(state >> 8);

1150 int byte3 = 0;	1165 int byte3 = 0;

1151 if (byte2 == 0) {	1166 if (byte2 == 0) {

1152 byte2 = UnsafeUtil.getByte(address++);	1167 byte2 = UNSAFE.getByte(address++);

1153 if (address >= addressLimit) {	1168 if (address >= addressLimit) {

1154 return incompleteStateFor(byte1, byte2);	1169 return incompleteStateFor(byte1, byte2);

1155 }	1170 }

1156 } else {	1171 } else {

1157 byte3 = (byte) (state >> 16);	1172 byte3 = (byte) (state >> 16);

1158 }	1173 }

1159 if (byte3 == 0) {	1174 if (byte3 == 0) {

1160 byte3 = UnsafeUtil.getByte(address++);	1175 byte3 = UNSAFE.getByte(address++);

1161 if (address >= addressLimit) {	1176 if (address >= addressLimit) {

1162 return incompleteStateFor(byte1, byte2, byte3);	1177 return incompleteStateFor(byte1, byte2, byte3);

1163 }	1178 }

1164 }	1179 }

1165	1180

1166 // If we were called with state == MALFORMED, then byte1 is 0xFF,	1181 // If we were called with state == MALFORMED, then byte1 is 0xFF,

1167 // which never occurs in well-formed UTF-8, and so we will return	1182 // which never occurs in well-formed UTF-8, and so we will return

1168 // MALFORMED again below.	1183 // MALFORMED again below.

1169	1184

1170 if (byte2 > (byte) 0xBF	1185 if (byte2 > (byte) 0xBF

1171 // Check that 1 <= plane <= 16. Tricky optimized form of:	1186 // Check that 1 <= plane <= 16. Tricky optimized form of:

1172 // if (byte1 > (byte) 0xF4 \|\|	1187 // if (byte1 > (byte) 0xF4 \|\|

1173 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 \|\|	1188 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 \|\|

1174 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)	1189 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)

1175 \|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0	1190 \|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0

1176 // byte3 trailing-byte test	1191 // byte3 trailing-byte test

1177 \|\| byte3 > (byte) 0xBF	1192 \|\| byte3 > (byte) 0xBF

1178 // byte4 trailing-byte test	1193 // byte4 trailing-byte test

1179 \|\| UnsafeUtil.getByte(address++) > (byte) 0xBF) {	1194 \|\| UNSAFE.getByte(address++) > (byte) 0xBF) {

1180 return MALFORMED;	1195 return MALFORMED;

1181 }	1196 }

1182 }	1197 }

1183 }	1198 }

1184	1199

1185 return partialIsValidUtf8(address, (int) (addressLimit - address));	1200 return partialIsValidUtf8(address, (int) (addressLimit - address));

1186 }	1201 }

1187	1202

1188 @Override	1203 @Override

1189 int encodeUtf8(final CharSequence in, final byte[] out, final int offset, fi nal int length) {	1204 int encodeUtf8(final CharSequence in, final byte[] out, final int offset, fi nal int length) {

1190 long outIx = getArrayBaseOffset() + offset;	1205 long outIx = ARRAY_BASE_OFFSET + offset;

1191 final long outLimit = outIx + length;	1206 final long outLimit = outIx + length;

1192 final int inLimit = in.length();	1207 final int inLimit = in.length();

1193 if (inLimit > length \|\| out.length - length < offset) {	1208 if (inLimit > length \|\| out.length - length < offset) {

1194 // Not even enough room for an ASCII-encoded string.	1209 // Not even enough room for an ASCII-encoded string.

1195 throw new ArrayIndexOutOfBoundsException(	1210 throw new ArrayIndexOutOfBoundsException(

1196 "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset + length));	1211 "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset + length));

1197 }	1212 }

1198	1213

1199 // Designed to take advantage of	1214 // Designed to take advantage of

1200 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination	1215 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination

1201 int inIx = 0;	1216 int inIx = 0;

1202 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {	1217 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {

1203 UnsafeUtil.putByte(out, outIx++, (byte) c);	1218 UNSAFE.putByte(out, outIx++, (byte) c);

1204 }	1219 }

1205 if (inIx == inLimit) {	1220 if (inIx == inLimit) {

1206 // We're done, it was ASCII encoded.	1221 // We're done, it was ASCII encoded.

1207 return (int) (outIx - getArrayBaseOffset());	1222 return (int) (outIx - ARRAY_BASE_OFFSET);

1208 }	1223 }

1209	1224

1210 for (char c; inIx < inLimit; ++inIx) {	1225 for (char c; inIx < inLimit; ++inIx) {

1211 c = in.charAt(inIx);	1226 c = in.charAt(inIx);

1212 if (c < 0x80 && outIx < outLimit) {	1227 if (c < 0x80 && outIx < outLimit) {

1213 UnsafeUtil.putByte(out, outIx++, (byte) c);	1228 UNSAFE.putByte(out, outIx++, (byte) c);

1214 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes	1229 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes

1215 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 6) \| (c >>> 6)));	1230 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 6) \| (c >>> 6)));

1216 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 \| (0x3F & c)));	1231 UNSAFE.putByte(out, outIx++, (byte) (0x80 \| (0x3F & c)));

1217 } else if ((c < MIN_SURROGATE \|\| MAX_SURROGATE < c) && outIx <= outLimit - 3L) {	1232 } else if ((c < MIN_SURROGATE \|\| MAX_SURROGATE < c) && outIx <= outLimit - 3L) {

1218 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes	1233 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes

1219 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 5) \| (c >>> 12)));	1234 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 5) \| (c >>> 12)));

1220 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 \| (0x3F & (c >>> 6))));	1235 UNSAFE.putByte(out, outIx++, (byte) (0x80 \| (0x3F & (c >>> 6))));

1221 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 \| (0x3F & c)));	1236 UNSAFE.putByte(out, outIx++, (byte) (0x80 \| (0x3F & c)));

1222 } else if (outIx <= outLimit - 4L) {	1237 } else if (outIx <= outLimit - 4L) {

1223 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8	1238 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8

1224 // bytes	1239 // bytes

1225 final char low;	1240 final char low;

1226 if (inIx + 1 == inLimit \|\| !isSurrogatePair(c, (low = in.charAt(++inIx )))) {	1241 if (inIx + 1 == inLimit \|\| !isSurrogatePair(c, (low = in.charAt(++inIx )))) {

1227 throw new UnpairedSurrogateException((inIx - 1), inLimit);	1242 throw new UnpairedSurrogateException((inIx - 1), inLimit);

1228 }	1243 }

1229 int codePoint = toCodePoint(c, low);	1244 int codePoint = toCodePoint(c, low);

1230 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 4) \| (codePoint >>> 1 8)));	1245 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 4) \| (codePoint >>> 18))) ;

1231 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 \| (0x3F & (codePoint >>> 12))));	1246 UNSAFE.putByte(out, outIx++, (byte) (0x80 \| (0x3F & (codePoint >>> 12) )));

1232 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 \| (0x3F & (codePoint >>> 6))));	1247 UNSAFE.putByte(out, outIx++, (byte) (0x80 \| (0x3F & (codePoint >>> 6)) ));

1233 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 \| (0x3F & codePoint)));	1248 UNSAFE.putByte(out, outIx++, (byte) (0x80 \| (0x3F & codePoint)));

1234 } else {	1249 } else {

1235 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)	1250 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)

1236 && (inIx + 1 == inLimit \|\| !isSurrogatePair(c, in.charAt(inIx + 1) ))) {	1251 && (inIx + 1 == inLimit \|\| !isSurrogatePair(c, in.charAt(inIx + 1) ))) {

1237 // We are surrogates and we're not a surrogate pair.	1252 // We are surrogates and we're not a surrogate pair.

1238 throw new UnpairedSurrogateException(inIx, inLimit);	1253 throw new UnpairedSurrogateException(inIx, inLimit);

1239 }	1254 }

1240 // Not enough space in the output buffer.	1255 // Not enough space in the output buffer.

1241 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);	1256 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);

1242 }	1257 }

1243 }	1258 }

1244	1259

1245 // All bytes have been encoded.	1260 // All bytes have been encoded.

1246 return (int) (outIx - getArrayBaseOffset());	1261 return (int) (outIx - ARRAY_BASE_OFFSET);

1247 }	1262 }

1248	1263

1249 @Override	1264 @Override

1250 void encodeUtf8Direct(CharSequence in, ByteBuffer out) {	1265 void encodeUtf8Direct(CharSequence in, ByteBuffer out) {

1251 final long address = addressOffset(out);	1266 final long address = addressOffset(out);

1252 long outIx = address + out.position();	1267 long outIx = address + out.position();

1253 final long outLimit = address + out.limit();	1268 final long outLimit = address + out.limit();

1254 final int inLimit = in.length();	1269 final int inLimit = in.length();

1255 if (inLimit > outLimit - outIx) {	1270 if (inLimit > outLimit - outIx) {

1256 // Not even enough room for an ASCII-encoded string.	1271 // Not even enough room for an ASCII-encoded string.

1257 throw new ArrayIndexOutOfBoundsException(	1272 throw new ArrayIndexOutOfBoundsException(

1258 "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limi t());	1273 "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limi t());

1259 }	1274 }

1260	1275

1261 // Designed to take advantage of	1276 // Designed to take advantage of

1262 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination	1277 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination

1263 int inIx = 0;	1278 int inIx = 0;

1264 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {	1279 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {

1265 UnsafeUtil.putByte(outIx++, (byte) c);	1280 UNSAFE.putByte(outIx++, (byte) c);

1266 }	1281 }

1267 if (inIx == inLimit) {	1282 if (inIx == inLimit) {

1268 // We're done, it was ASCII encoded.	1283 // We're done, it was ASCII encoded.

1269 out.position((int) (outIx - address));	1284 out.position((int) (outIx - address));

1270 return;	1285 return;

1271 }	1286 }

1272	1287

1273 for (char c; inIx < inLimit; ++inIx) {	1288 for (char c; inIx < inLimit; ++inIx) {

1274 c = in.charAt(inIx);	1289 c = in.charAt(inIx);

1275 if (c < 0x80 && outIx < outLimit) {	1290 if (c < 0x80 && outIx < outLimit) {

1276 UnsafeUtil.putByte(outIx++, (byte) c);	1291 UNSAFE.putByte(outIx++, (byte) c);

1277 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes	1292 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes

1278 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 6) \| (c >>> 6)));	1293 UNSAFE.putByte(outIx++, (byte) ((0xF << 6) \| (c >>> 6)));

1279 UnsafeUtil.putByte(outIx++, (byte) (0x80 \| (0x3F & c)));	1294 UNSAFE.putByte(outIx++, (byte) (0x80 \| (0x3F & c)));

1280 } else if ((c < MIN_SURROGATE \|\| MAX_SURROGATE < c) && outIx <= outLimit - 3L) {	1295 } else if ((c < MIN_SURROGATE \|\| MAX_SURROGATE < c) && outIx <= outLimit - 3L) {

1281 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes	1296 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes

1282 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 5) \| (c >>> 12)));	1297 UNSAFE.putByte(outIx++, (byte) ((0xF << 5) \| (c >>> 12)));

1283 UnsafeUtil.putByte(outIx++, (byte) (0x80 \| (0x3F & (c >>> 6))));	1298 UNSAFE.putByte(outIx++, (byte) (0x80 \| (0x3F & (c >>> 6))));

1284 UnsafeUtil.putByte(outIx++, (byte) (0x80 \| (0x3F & c)));	1299 UNSAFE.putByte(outIx++, (byte) (0x80 \| (0x3F & c)));

1285 } else if (outIx <= outLimit - 4L) {	1300 } else if (outIx <= outLimit - 4L) {

1286 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8	1301 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8

1287 // bytes	1302 // bytes

1288 final char low;	1303 final char low;

1289 if (inIx + 1 == inLimit \|\| !isSurrogatePair(c, (low = in.charAt(++inIx )))) {	1304 if (inIx + 1 == inLimit \|\| !isSurrogatePair(c, (low = in.charAt(++inIx )))) {

1290 throw new UnpairedSurrogateException((inIx - 1), inLimit);	1305 throw new UnpairedSurrogateException((inIx - 1), inLimit);

1291 }	1306 }

1292 int codePoint = toCodePoint(c, low);	1307 int codePoint = toCodePoint(c, low);

1293 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 4) \| (codePoint >>> 18)));	1308 UNSAFE.putByte(outIx++, (byte) ((0xF << 4) \| (codePoint >>> 18)));

1294 UnsafeUtil.putByte(outIx++, (byte) (0x80 \| (0x3F & (codePoint >>> 12)) ));	1309 UNSAFE.putByte(outIx++, (byte) (0x80 \| (0x3F & (codePoint >>> 12))));

1295 UnsafeUtil.putByte(outIx++, (byte) (0x80 \| (0x3F & (codePoint >>> 6))) );	1310 UNSAFE.putByte(outIx++, (byte) (0x80 \| (0x3F & (codePoint >>> 6))));

1296 UnsafeUtil.putByte(outIx++, (byte) (0x80 \| (0x3F & codePoint)));	1311 UNSAFE.putByte(outIx++, (byte) (0x80 \| (0x3F & codePoint)));

1297 } else {	1312 } else {

1298 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)	1313 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)

1299 && (inIx + 1 == inLimit \|\| !isSurrogatePair(c, in.charAt(inIx + 1) ))) {	1314 && (inIx + 1 == inLimit \|\| !isSurrogatePair(c, in.charAt(inIx + 1) ))) {

1300 // We are surrogates and we're not a surrogate pair.	1315 // We are surrogates and we're not a surrogate pair.

1301 throw new UnpairedSurrogateException(inIx, inLimit);	1316 throw new UnpairedSurrogateException(inIx, inLimit);

1302 }	1317 }

1303 // Not enough space in the output buffer.	1318 // Not enough space in the output buffer.

1304 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);	1319 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);

1305 }	1320 }

1306 }	1321 }

(...skipping 20 matching lines...) Expand all Loading...
1327 return 0;	1342 return 0;

1328 }	1343 }

1329	1344

1330 // Read bytes until 8-byte aligned so that we can read longs in the loop b elow.	1345 // Read bytes until 8-byte aligned so that we can read longs in the loop b elow.

1331 // Byte arrays are already either 8 or 16-byte aligned, so we just need to make sure that	1346 // Byte arrays are already either 8 or 16-byte aligned, so we just need to make sure that

1332 // the index (relative to the start of the array) is also 8-byte aligned. We do this by	1347 // the index (relative to the start of the array) is also 8-byte aligned. We do this by

1333 // ANDing the index with 7 to determine the number of bytes that need to b e read before	1348 // ANDing the index with 7 to determine the number of bytes that need to b e read before

1334 // we're 8-byte aligned.	1349 // we're 8-byte aligned.

1335 final int unaligned = (int) offset & 7;	1350 final int unaligned = (int) offset & 7;

1336 for (int j = unaligned; j > 0; j--) {	1351 for (int j = unaligned; j > 0; j--) {

1337 if (UnsafeUtil.getByte(bytes, offset++) < 0) {	1352 if (UNSAFE.getByte(bytes, offset++) < 0) {

1338 return unaligned - j;	1353 return unaligned - j;

1339 }	1354 }

1340 }	1355 }

1341	1356

1342 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII ).	1357 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII ).

1343 // To speed things up further, we're reading longs instead of bytes so we use a mask to	1358 // To speed things up further, we're reading longs instead of bytes so we use a mask to

1344 // determine if any byte in the current long is non-ASCII.	1359 // determine if any byte in the current long is non-ASCII.

1345 remaining -= unaligned;	1360 remaining -= unaligned;

1346 for (; remaining >= 8 && (UnsafeUtil.getLong(bytes, offset) & ASCII_MASK_L ONG) == 0;	1361 for (; remaining >= 8 && (UNSAFE.getLong(bytes, offset) & ASCII_MASK_LONG) == 0;

1347 offset += 8, remaining -= 8) {}	1362 offset += 8, remaining -= 8) {}

1348 return maxChars - remaining;	1363 return maxChars - remaining;

1349 }	1364 }

1350	1365

1351 /**	1366 /**

1352 * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} excep t that it uses the	1367 * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} excep t that it uses the

1353 * most efficient method available to the platform.	1368 * most efficient method available to the platform.

1354 */	1369 */

1355 private static int unsafeEstimateConsecutiveAscii(long address, final int ma xChars) {	1370 private static int unsafeEstimateConsecutiveAscii(long address, final int ma xChars) {

1356 int remaining = maxChars;	1371 int remaining = maxChars;

1357 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) {	1372 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) {

1358 // Don't bother with small strings.	1373 // Don't bother with small strings.

1359 return 0;	1374 return 0;

1360 }	1375 }

1361	1376

1362 // Read bytes until 8-byte aligned so that we can read longs in the loop b elow.	1377 // Read bytes until 8-byte aligned so that we can read longs in the loop b elow.

1363 // We do this by ANDing the address with 7 to determine the number of byte s that need to	1378 // We do this by ANDing the address with 7 to determine the number of byte s that need to

1364 // be read before we're 8-byte aligned.	1379 // be read before we're 8-byte aligned.

1365 final int unaligned = (int) address & 7;	1380 final int unaligned = (int) address & 7;

1366 for (int j = unaligned; j > 0; j--) {	1381 for (int j = unaligned; j > 0; j--) {

1367 if (UnsafeUtil.getByte(address++) < 0) {	1382 if (UNSAFE.getByte(address++) < 0) {

1368 return unaligned - j;	1383 return unaligned - j;

1369 }	1384 }

1370 }	1385 }

1371	1386

1372 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII ).	1387 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII ).

1373 // To speed things up further, we're reading longs instead of bytes so we use a mask to	1388 // To speed things up further, we're reading longs instead of bytes so we use a mask to

1374 // determine if any byte in the current long is non-ASCII.	1389 // determine if any byte in the current long is non-ASCII.

1375 remaining -= unaligned;	1390 remaining -= unaligned;

1376 for (; remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) = = 0;	1391 for (; remaining >= 8 && (UNSAFE.getLong(address) & ASCII_MASK_LONG) == 0;

1377 address += 8, remaining -= 8) {}	1392 address += 8, remaining -= 8) {}

1378 return maxChars - remaining;	1393 return maxChars - remaining;

1379 }	1394 }

1380	1395

1381 private static int partialIsValidUtf8(final byte[] bytes, long offset, int r emaining) {	1396 private static int partialIsValidUtf8(final byte[] bytes, long offset, int r emaining) {

1382 // Skip past ASCII characters as quickly as possible.	1397 // Skip past ASCII characters as quickly as possible.

1383 final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remainin g);	1398 final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remainin g);

1384 remaining -= skipped;	1399 remaining -= skipped;

1385 offset += skipped;	1400 offset += skipped;

1386	1401

1387 for (;;) {	1402 for (;;) {

1388 // Optimize for interior runs of ASCII bytes.	1403 // Optimize for interior runs of ASCII bytes.

1389 // TODO(nathanmittler): Consider checking 8 bytes at a time after some t hreshold?	1404 // TODO(nathanmittler): Consider checking 8 bytes at a time after some t hreshold?

1390 // Maybe after seeing a few in a row that are ASCII, go back to fast mod e?	1405 // Maybe after seeing a few in a row that are ASCII, go back to fast mod e?

1391 int byte1 = 0;	1406 int byte1 = 0;

1392 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >= 0; --remaining) {	1407 for (; remaining > 0 && (byte1 = UNSAFE.getByte(bytes, offset++)) >= 0; --remaining) {

1393 }	1408 }

1394 if (remaining == 0) {	1409 if (remaining == 0) {

1395 return COMPLETE;	1410 return COMPLETE;

1396 }	1411 }

1397 remaining--;	1412 remaining--;

1398	1413

1399 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms.	1414 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms.

1400 if (byte1 < (byte) 0xE0) {	1415 if (byte1 < (byte) 0xE0) {

1401 // Two-byte form (110xxxxx 10xxxxxx)	1416 // Two-byte form (110xxxxx 10xxxxxx)

1402 if (remaining == 0) {	1417 if (remaining == 0) {

1403 // Incomplete sequence	1418 // Incomplete sequence

1404 return byte1;	1419 return byte1;

1405 }	1420 }

1406 remaining--;	1421 remaining--;

1407	1422

1408 // Simultaneously checks for illegal trailing-byte in	1423 // Simultaneously checks for illegal trailing-byte in

1409 // leading position and overlong 2-byte form.	1424 // leading position and overlong 2-byte form.

1410 if (byte1 < (byte) 0xC2	1425 if (byte1 < (byte) 0xC2

1411 \|\| UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {	1426 \|\| UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {

1412 return MALFORMED;	1427 return MALFORMED;

1413 }	1428 }

1414 } else if (byte1 < (byte) 0xF0) {	1429 } else if (byte1 < (byte) 0xF0) {

1415 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)	1430 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)

1416 if (remaining < 2) {	1431 if (remaining < 2) {

1417 // Incomplete sequence	1432 // Incomplete sequence

1418 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);	1433 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);

1419 }	1434 }

1420 remaining -= 2;	1435 remaining -= 2;

1421	1436

1422 final int byte2;	1437 final int byte2;

1423 if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF	1438 if ((byte2 = UNSAFE.getByte(bytes, offset++)) > (byte) 0xBF

1424 // overlong? 5 most significant bits must not all be zero	1439 // overlong? 5 most significant bits must not all be zero

1425 \|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)	1440 \|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)

1426 // check for illegal surrogate codepoints	1441 // check for illegal surrogate codepoints

1427 \|\| (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)	1442 \|\| (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)

1428 // byte3 trailing-byte test	1443 // byte3 trailing-byte test

1429 \|\| UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {	1444 \|\| UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {

1430 return MALFORMED;	1445 return MALFORMED;

1431 }	1446 }

1432 } else {	1447 } else {

1433 // Four-byte form (1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx)	1448 // Four-byte form (1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx)

1434 if (remaining < 3) {	1449 if (remaining < 3) {

1435 // Incomplete sequence	1450 // Incomplete sequence

1436 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);	1451 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);

1437 }	1452 }

1438 remaining -= 3;	1453 remaining -= 3;

1439	1454

1440 final int byte2;	1455 final int byte2;

1441 if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF	1456 if ((byte2 = UNSAFE.getByte(bytes, offset++)) > (byte) 0xBF

1442 // Check that 1 <= plane <= 16. Tricky optimized form of:	1457 // Check that 1 <= plane <= 16. Tricky optimized form of:

1443 // if (byte1 > (byte) 0xF4 \|\|	1458 // if (byte1 > (byte) 0xF4 \|\|

1444 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 \|\|	1459 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 \|\|

1445 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)	1460 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)

1446 \|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0	1461 \|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0

1447 // byte3 trailing-byte test	1462 // byte3 trailing-byte test

1448 \|\| UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF	1463 \|\| UNSAFE.getByte(bytes, offset++) > (byte) 0xBF

1449 // byte4 trailing-byte test	1464 // byte4 trailing-byte test

1450 \|\| UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {	1465 \|\| UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {

1451 return MALFORMED;	1466 return MALFORMED;

1452 }	1467 }

1453 }	1468 }

1454 }	1469 }

1455 }	1470 }

1456	1471

1457 private static int partialIsValidUtf8(long address, int remaining) {	1472 private static int partialIsValidUtf8(long address, int remaining) {

1458 // Skip past ASCII characters as quickly as possible.	1473 // Skip past ASCII characters as quickly as possible.

1459 final int skipped = unsafeEstimateConsecutiveAscii(address, remaining);	1474 final int skipped = unsafeEstimateConsecutiveAscii(address, remaining);

1460 address += skipped;	1475 address += skipped;

1461 remaining -= skipped;	1476 remaining -= skipped;

1462	1477

1463 for (;;) {	1478 for (;;) {

1464 // Optimize for interior runs of ASCII bytes.	1479 // Optimize for interior runs of ASCII bytes.

1465 // TODO(nathanmittler): Consider checking 8 bytes at a time after some t hreshold?	1480 // TODO(nathanmittler): Consider checking 8 bytes at a time after some t hreshold?

1466 // Maybe after seeing a few in a row that are ASCII, go back to fast mod e?	1481 // Maybe after seeing a few in a row that are ASCII, go back to fast mod e?

1467 int byte1 = 0;	1482 int byte1 = 0;

1468 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; -- remaining) {	1483 for (; remaining > 0 && (byte1 = UNSAFE.getByte(address++)) >= 0; --rema ining) {

1469 }	1484 }

1470 if (remaining == 0) {	1485 if (remaining == 0) {

1471 return COMPLETE;	1486 return COMPLETE;

1472 }	1487 }

1473 remaining--;	1488 remaining--;

1474	1489

1475 if (byte1 < (byte) 0xE0) {	1490 if (byte1 < (byte) 0xE0) {

1476 // Two-byte form	1491 // Two-byte form

1477	1492

1478 if (remaining == 0) {	1493 if (remaining == 0) {

1479 // Incomplete sequence	1494 // Incomplete sequence

1480 return byte1;	1495 return byte1;

1481 }	1496 }

1482 remaining--;	1497 remaining--;

1483	1498

1484 // Simultaneously checks for illegal trailing-byte in	1499 // Simultaneously checks for illegal trailing-byte in

1485 // leading position and overlong 2-byte form.	1500 // leading position and overlong 2-byte form.

1486 if (byte1 < (byte) 0xC2 \|\| UnsafeUtil.getByte(address++) > (byte) 0xBF ) {	1501 if (byte1 < (byte) 0xC2 \|\| UNSAFE.getByte(address++) > (byte) 0xBF) {

1487 return MALFORMED;	1502 return MALFORMED;

1488 }	1503 }

1489 } else if (byte1 < (byte) 0xF0) {	1504 } else if (byte1 < (byte) 0xF0) {

1490 // Three-byte form	1505 // Three-byte form

1491	1506

1492 if (remaining < 2) {	1507 if (remaining < 2) {

1493 // Incomplete sequence	1508 // Incomplete sequence

1494 return unsafeIncompleteStateFor(address, byte1, remaining);	1509 return unsafeIncompleteStateFor(address, byte1, remaining);

1495 }	1510 }

1496 remaining -= 2;	1511 remaining -= 2;

1497	1512

1498 final byte byte2 = UnsafeUtil.getByte(address++);	1513 final byte byte2 = UNSAFE.getByte(address++);

1499 if (byte2 > (byte) 0xBF	1514 if (byte2 > (byte) 0xBF

1500 // overlong? 5 most significant bits must not all be zero	1515 // overlong? 5 most significant bits must not all be zero

1501 \|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)	1516 \|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)

1502 // check for illegal surrogate codepoints	1517 // check for illegal surrogate codepoints

1503 \|\| (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)	1518 \|\| (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)

1504 // byte3 trailing-byte test	1519 // byte3 trailing-byte test

1505 \|\| UnsafeUtil.getByte(address++) > (byte) 0xBF) {	1520 \|\| UNSAFE.getByte(address++) > (byte) 0xBF) {

1506 return MALFORMED;	1521 return MALFORMED;

1507 }	1522 }

1508 } else {	1523 } else {

1509 // Four-byte form	1524 // Four-byte form

1510	1525

1511 if (remaining < 3) {	1526 if (remaining < 3) {

1512 // Incomplete sequence	1527 // Incomplete sequence

1513 return unsafeIncompleteStateFor(address, byte1, remaining);	1528 return unsafeIncompleteStateFor(address, byte1, remaining);

1514 }	1529 }

1515 remaining -= 3;	1530 remaining -= 3;

1516	1531

1517 final byte byte2 = UnsafeUtil.getByte(address++);	1532 final byte byte2 = UNSAFE.getByte(address++);

1518 if (byte2 > (byte) 0xBF	1533 if (byte2 > (byte) 0xBF

1519 // Check that 1 <= plane <= 16. Tricky optimized form of:	1534 // Check that 1 <= plane <= 16. Tricky optimized form of:

1520 // if (byte1 > (byte) 0xF4 \|\|	1535 // if (byte1 > (byte) 0xF4 \|\|

1521 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 \|\|	1536 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 \|\|

1522 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)	1537 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)

1523 \|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0	1538 \|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0

1524 // byte3 trailing-byte test	1539 // byte3 trailing-byte test

1525 \|\| UnsafeUtil.getByte(address++) > (byte) 0xBF	1540 \|\| UNSAFE.getByte(address++) > (byte) 0xBF

1526 // byte4 trailing-byte test	1541 // byte4 trailing-byte test

1527 \|\| UnsafeUtil.getByte(address++) > (byte) 0xBF) {	1542 \|\| UNSAFE.getByte(address++) > (byte) 0xBF) {

1528 return MALFORMED;	1543 return MALFORMED;

1529 }	1544 }

1530 }	1545 }

1531 }	1546 }

1532 }	1547 }

1533	1548

1534 private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long of fset,	1549 private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long of fset,

1535 int remaining) {	1550 int remaining) {

1536 switch (remaining) {	1551 switch (remaining) {

1537 case 0: {	1552 case 0: {

1538 return incompleteStateFor(byte1);	1553 return incompleteStateFor(byte1);

1539 }	1554 }

1540 case 1: {	1555 case 1: {

1541 return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset));	1556 return incompleteStateFor(byte1, UNSAFE.getByte(bytes, offset));

1542 }	1557 }

1543 case 2: {	1558 case 2: {

1544 return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset),	1559 return incompleteStateFor(byte1, UNSAFE.getByte(bytes, offset),

1545 UnsafeUtil.getByte(bytes, offset + 1));	1560 UNSAFE.getByte(bytes, offset + 1));

1546 }	1561 }

1547 default: {	1562 default: {

1548 throw new AssertionError();	1563 throw new AssertionError();

1549 }	1564 }

1550 }	1565 }

1551 }	1566 }

1552	1567

1553 private static int unsafeIncompleteStateFor(long address, final int byte1, i nt remaining) {	1568 private static int unsafeIncompleteStateFor(long address, final int byte1, i nt remaining) {

1554 switch (remaining) {	1569 switch (remaining) {

1555 case 0: {	1570 case 0: {

1556 return incompleteStateFor(byte1);	1571 return incompleteStateFor(byte1);

1557 }	1572 }

1558 case 1: {	1573 case 1: {

1559 return incompleteStateFor(byte1, UnsafeUtil.getByte(address));	1574 return incompleteStateFor(byte1, UNSAFE.getByte(address));

1560 }	1575 }

1561 case 2: {	1576 case 2: {

1562 return incompleteStateFor(byte1, UnsafeUtil.getByte(address),	1577 return incompleteStateFor(byte1, UNSAFE.getByte(address), UNSAFE.getBy te(address + 1));

1563 UnsafeUtil.getByte(address + 1));

1564 }	1578 }

1565 default: {	1579 default: {

1566 throw new AssertionError();	1580 throw new AssertionError();

1567 }	1581 }

1568 }	1582 }

1569 }	1583 }

	1584

	1585 /**

	1586 * Gets the field with the given name within the class, or {@code null} if n ot found. If

	1587 * found, the field is made accessible.

	1588 */

	1589 private static Field field(Class<?> clazz, String fieldName) {

	1590 Field field;

	1591 try {

	1592 field = clazz.getDeclaredField(fieldName);

	1593 field.setAccessible(true);

	1594 } catch (Throwable t) {

	1595 // Failed to access the fields.

	1596 field = null;

	1597 }

	1598 logger.log(Level.FINEST, "{0}.{1}: {2}",

	1599 new Object[] {clazz.getName(), fieldName, (field != null ? "available" : "unavailable")});

	1600 return field;

	1601 }

	1602

	1603 /**

	1604 * Returns the offset of the provided field, or {@code -1} if {@code sun.mis c.Unsafe} is not

	1605 * available.

	1606 */

	1607 private static long fieldOffset(Field field) {

	1608 return field == null \|\| UNSAFE == null ? -1 : UNSAFE.objectFieldOffset(fie ld);

	1609 }

	1610

	1611 /**

	1612 * Get the base offset for byte arrays, or {@code -1} if {@code sun.misc.Uns afe} is not

	1613 * available.

	1614 */

	1615 private static <T> int byteArrayBaseOffset() {

	1616 return UNSAFE == null ? -1 : UNSAFE.arrayBaseOffset(byte[].class);

	1617 }

	1618

	1619 /**

	1620 * Gets the offset of the {@code address} field of the given direct {@link B yteBuffer}.

	1621 */

	1622 private static long addressOffset(ByteBuffer buffer) {

	1623 return UNSAFE.getLong(buffer, BUFFER_ADDRESS_OFFSET);

	1624 }

	1625

	1626 /**

	1627 * Gets the {@code sun.misc.Unsafe} instance, or {@code null} if not availab le on this

	1628 * platform.

	1629 */

	1630 private static sun.misc.Unsafe getUnsafe() {

	1631 sun.misc.Unsafe unsafe = null;

	1632 try {

	1633 unsafe = AccessController.doPrivileged(new PrivilegedExceptionAction<sun .misc.Unsafe>() {

	1634 @Override

	1635 public sun.misc.Unsafe run() throws Exception {

	1636 Class<sun.misc.Unsafe> k = sun.misc.Unsafe.class;

	1637

	1638 // Check that this platform supports all of the required unsafe meth ods.

	1639 checkRequiredMethods(k);

	1640

	1641 for (Field f : k.getDeclaredFields()) {

	1642 f.setAccessible(true);

	1643 Object x = f.get(null);

	1644 if (k.isInstance(x)) {

	1645 return k.cast(x);

	1646 }

	1647 }

	1648 // The sun.misc.Unsafe field does not exist.

	1649 return null;

	1650 }

	1651 });

	1652 } catch (Throwable e) {

	1653 // Catching Throwable here due to the fact that Google AppEngine raises NoClassDefFoundError

	1654 // for Unsafe.

	1655 }

	1656

	1657 logger.log(Level.FINEST, "sun.misc.Unsafe: {}",

	1658 unsafe != null ? "available" : "unavailable");

	1659 return unsafe;

	1660 }

	1661

	1662 /**

	1663 * Verifies that all required methods of {@code sun.misc.Unsafe} are availab le on this platform.

	1664 */

	1665 private static void checkRequiredMethods(Class<sun.misc.Unsafe> clazz)

	1666 throws NoSuchMethodException, SecurityException {

	1667 // Needed for Unsafe byte[] access

	1668 clazz.getMethod("arrayBaseOffset", Class.class);

	1669 clazz.getMethod("getByte", Object.class, long.class);

	1670 clazz.getMethod("putByte", Object.class, long.class, byte.class);

	1671 clazz.getMethod("getLong", Object.class, long.class);

	1672

	1673 // Needed for Unsafe Direct ByteBuffer access

	1674 clazz.getMethod("objectFieldOffset", Field.class);

	1675 clazz.getMethod("getByte", long.class);

	1676 clazz.getMethod("getLong", Object.class, long.class);

	1677 clazz.getMethod("putByte", long.class, byte.class);

	1678 clazz.getMethod("getLong", long.class);

	1679 }

1570 }	1680 }

1571	1681

/** Private so that this class cannot be instantiated from outside. */
private Utf8() {
  // Intentionally empty.
}

1573 }	1683 }

OLD	NEW