| OLD | NEW |
| 1 // Protocol Buffers - Google's data interchange format | 1 // Protocol Buffers - Google's data interchange format |
| 2 // Copyright 2008 Google Inc. All rights reserved. | 2 // Copyright 2008 Google Inc. All rights reserved. |
| 3 // https://developers.google.com/protocol-buffers/ | 3 // https://developers.google.com/protocol-buffers/ |
| 4 // | 4 // |
| 5 // Redistribution and use in source and binary forms, with or without | 5 // Redistribution and use in source and binary forms, with or without |
| 6 // modification, are permitted provided that the following conditions are | 6 // modification, are permitted provided that the following conditions are |
| 7 // met: | 7 // met: |
| 8 // | 8 // |
| 9 // * Redistributions of source code must retain the above copyright | 9 // * Redistributions of source code must retain the above copyright |
| 10 // notice, this list of conditions and the following disclaimer. | 10 // notice, this list of conditions and the following disclaimer. |
| (...skipping 12 matching lines...) Expand all Loading... |
| 23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | 23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | 24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | 25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | 26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 30 | 30 |
| 31 package com.google.protobuf; | 31 package com.google.protobuf; |
| 32 | 32 |
| 33 import static com.google.protobuf.UnsafeUtil.addressOffset; |
| 34 import static com.google.protobuf.UnsafeUtil.getArrayBaseOffset; |
| 35 import static com.google.protobuf.UnsafeUtil.hasUnsafeArrayOperations; |
| 36 import static com.google.protobuf.UnsafeUtil.hasUnsafeByteBufferOperations; |
| 33 import static java.lang.Character.MAX_SURROGATE; | 37 import static java.lang.Character.MAX_SURROGATE; |
| 34 import static java.lang.Character.MIN_SURROGATE; | 38 import static java.lang.Character.MIN_SURROGATE; |
| 35 import static java.lang.Character.isSurrogatePair; | 39 import static java.lang.Character.isSurrogatePair; |
| 36 import static java.lang.Character.toCodePoint; | 40 import static java.lang.Character.toCodePoint; |
| 37 | 41 |
| 38 import java.lang.reflect.Field; | |
| 39 import java.nio.Buffer; | |
| 40 import java.nio.ByteBuffer; | 42 import java.nio.ByteBuffer; |
| 41 import java.security.AccessController; | |
| 42 import java.security.PrivilegedExceptionAction; | |
| 43 import java.util.logging.Level; | |
| 44 import java.util.logging.Logger; | |
| 45 | 43 |
| 46 /** | 44 /** |
| 47 * A set of low-level, high-performance static utility methods related | 45 * A set of low-level, high-performance static utility methods related |
| 48 * to the UTF-8 character encoding. This class has no dependencies | 46 * to the UTF-8 character encoding. This class has no dependencies |
| 49 * outside of the core JDK libraries. | 47 * outside of the core JDK libraries. |
| 50 * | 48 * |
| 51 * <p>There are several variants of UTF-8. The one implemented by | 49 * <p>There are several variants of UTF-8. The one implemented by |
| 52 * this class is the restricted definition of UTF-8 introduced in | 50 * this class is the restricted definition of UTF-8 introduced in |
| 53 * Unicode 3.1, which mandates the rejection of "overlong" byte | 51 * Unicode 3.1, which mandates the rejection of "overlong" byte |
| 54 * sequences as well as rejection of 3-byte surrogate codepoint byte | 52 * sequences as well as rejection of 3-byte surrogate codepoint byte |
| (...skipping 17 matching lines...) Expand all Loading... |
| 72 * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is | 70 * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is |
| 73 * well-formed in the absence of additional input, or if the byte sequence | 71 * well-formed in the absence of additional input, or if the byte sequence |
| 74 * apparently terminated in the middle of a character, an opaque integer | 72 * apparently terminated in the middle of a character, an opaque integer |
| 75 * "state" value containing enough information to decode the character when | 73 * "state" value containing enough information to decode the character when |
| 76 * passed to a subsequent invocation of a partial decoding method. | 74 * passed to a subsequent invocation of a partial decoding method. |
| 77 * | 75 * |
| 78 * @author martinrb@google.com (Martin Buchholz) | 76 * @author martinrb@google.com (Martin Buchholz) |
| 79 */ | 77 */ |
| 80 // TODO(nathanmittler): Copy changes in this class back to Guava | 78 // TODO(nathanmittler): Copy changes in this class back to Guava |
| 81 final class Utf8 { | 79 final class Utf8 { |
| 82 private static final Logger logger = Logger.getLogger(Utf8.class.getName()); | |
| 83 | 80 |
| 84 /** | 81 /** |
| 85 * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations | 82 * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations |
| 86 * depending on what is available on the platform. The processor is the platform-optimized | 83 * depending on what is available on the platform. The processor is the platform-optimized |
| 87 * delegate for which all methods are delegated directly to. | 84 * delegate for which all methods are delegated directly to. |
| 88 */ | 85 */ |
| 89 private static final Processor processor = | 86 private static final Processor processor = |
| 90 UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor(
); | 87 UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor(
); |
| 91 | 88 |
| 92 /** | 89 /** |
| (...skipping 137 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 230 default: | 227 default: |
| 231 throw new AssertionError(); | 228 throw new AssertionError(); |
| 232 } | 229 } |
| 233 } | 230 } |
| 234 | 231 |
| 235 // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw | 232 // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw |
| 236 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can | 233 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can |
| 237 // fallback to more lenient behavior. | 234 // fallback to more lenient behavior. |
| 238 | 235 |
| 239 static class UnpairedSurrogateException extends IllegalArgumentException { | 236 static class UnpairedSurrogateException extends IllegalArgumentException { |
| 240 private UnpairedSurrogateException(int index, int length) { | 237 UnpairedSurrogateException(int index, int length) { |
| 241 super("Unpaired surrogate at index " + index + " of " + length); | 238 super("Unpaired surrogate at index " + index + " of " + length); |
| 242 } | 239 } |
| 243 } | 240 } |
| 244 | 241 |
| 245 /** | 242 /** |
| 246 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, | 243 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, |
| 247 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in | 244 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in |
| 248 * both time and space. | 245 * both time and space. |
| 249 * | 246 * |
| 250 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired | 247 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired |
| (...skipping 733 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 984 } | 981 } |
| 985 } | 982 } |
| 986 } | 983 } |
| 987 } | 984 } |
| 988 } | 985 } |
| 989 | 986 |
| 990 /** | 987 /** |
| 991 * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance. | 988 * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance. |
| 992 */ | 989 */ |
| 993 static final class UnsafeProcessor extends Processor { | 990 static final class UnsafeProcessor extends Processor { |
| 994 private static final sun.misc.Unsafe UNSAFE = getUnsafe(); | |
| 995 private static final long BUFFER_ADDRESS_OFFSET = | |
| 996 fieldOffset(field(Buffer.class, "address")); | |
| 997 private static final int ARRAY_BASE_OFFSET = byteArrayBaseOffset(); | |
| 998 | |
| 999 /** | |
| 1000 * We only use Unsafe operations if we have access to direct {@link ByteBuffer}'s address | |
| 1001 * and the array base offset is a multiple of 8 (needed by Unsafe.getLong()). | |
| 1002 */ | |
| 1003 private static final boolean AVAILABLE = | |
| 1004 BUFFER_ADDRESS_OFFSET != -1 && ARRAY_BASE_OFFSET % 8 == 0; | |
| 1005 | |
| 1006 /** | 991 /** |
| 1007 * Indicates whether or not all required unsafe operations are supported on this platform. | 992 * Indicates whether or not all required unsafe operations are supported on this platform. |
| 1008 */ | 993 */ |
| 1009 static boolean isAvailable() { | 994 static boolean isAvailable() { |
| 1010 return AVAILABLE; | 995 return hasUnsafeArrayOperations() && hasUnsafeByteBufferOperations(); |
| 1011 } | 996 } |
| 1012 | 997 |
| 1013 @Override | 998 @Override |
| 1014 int partialIsValidUtf8(int state, byte[] bytes, final int index, final int l
imit) { | 999 int partialIsValidUtf8(int state, byte[] bytes, final int index, final int l
imit) { |
| 1015 if ((index | limit | bytes.length - limit) < 0) { | 1000 if ((index | limit | bytes.length - limit) < 0) { |
| 1016 throw new ArrayIndexOutOfBoundsException( | 1001 throw new ArrayIndexOutOfBoundsException( |
| 1017 String.format("Array length=%d, index=%d, limit=%d", bytes.length, i
ndex, limit)); | 1002 String.format("Array length=%d, index=%d, limit=%d", bytes.length, i
ndex, limit)); |
| 1018 } | 1003 } |
| 1019 long offset = ARRAY_BASE_OFFSET + index; | 1004 long offset = getArrayBaseOffset() + index; |
| 1020 final long offsetLimit = ARRAY_BASE_OFFSET + limit; | 1005 final long offsetLimit = getArrayBaseOffset() + limit; |
| 1021 if (state != COMPLETE) { | 1006 if (state != COMPLETE) { |
| 1022 // The previous decoding operation was incomplete (or malformed). | 1007 // The previous decoding operation was incomplete (or malformed). |
| 1023 // We look for a well-formed sequence consisting of bytes from | 1008 // We look for a well-formed sequence consisting of bytes from |
| 1024 // the previous decoding operation (stored in state) together | 1009 // the previous decoding operation (stored in state) together |
| 1025 // with bytes from the array slice. | 1010 // with bytes from the array slice. |
| 1026 // | 1011 // |
| 1027 // We expect such "straddler characters" to be rare. | 1012 // We expect such "straddler characters" to be rare. |
| 1028 | 1013 |
| 1029 if (offset >= offsetLimit) { // No bytes? No progress. | 1014 if (offset >= offsetLimit) { // No bytes? No progress. |
| 1030 return state; | 1015 return state; |
| 1031 } | 1016 } |
| 1032 int byte1 = (byte) state; | 1017 int byte1 = (byte) state; |
| 1033 // byte1 is never ASCII. | 1018 // byte1 is never ASCII. |
| 1034 if (byte1 < (byte) 0xE0) { | 1019 if (byte1 < (byte) 0xE0) { |
| 1035 // two-byte form | 1020 // two-byte form |
| 1036 | 1021 |
| 1037 // Simultaneously checks for illegal trailing-byte in | 1022 // Simultaneously checks for illegal trailing-byte in |
| 1038 // leading position and overlong 2-byte form. | 1023 // leading position and overlong 2-byte form. |
| 1039 if (byte1 < (byte) 0xC2 | 1024 if (byte1 < (byte) 0xC2 |
| 1040 // byte2 trailing-byte test | 1025 // byte2 trailing-byte test |
| 1041 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { | 1026 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { |
| 1042 return MALFORMED; | 1027 return MALFORMED; |
| 1043 } | 1028 } |
| 1044 } else if (byte1 < (byte) 0xF0) { | 1029 } else if (byte1 < (byte) 0xF0) { |
| 1045 // three-byte form | 1030 // three-byte form |
| 1046 | 1031 |
| 1047 // Get byte2 from saved state or array | 1032 // Get byte2 from saved state or array |
| 1048 int byte2 = (byte) ~(state >> 8); | 1033 int byte2 = (byte) ~(state >> 8); |
| 1049 if (byte2 == 0) { | 1034 if (byte2 == 0) { |
| 1050 byte2 = UNSAFE.getByte(bytes, offset++); | 1035 byte2 = UnsafeUtil.getByte(bytes, offset++); |
| 1051 if (offset >= offsetLimit) { | 1036 if (offset >= offsetLimit) { |
| 1052 return incompleteStateFor(byte1, byte2); | 1037 return incompleteStateFor(byte1, byte2); |
| 1053 } | 1038 } |
| 1054 } | 1039 } |
| 1055 if (byte2 > (byte) 0xBF | 1040 if (byte2 > (byte) 0xBF |
| 1056 // overlong? 5 most significant bits must not all be zero | 1041 // overlong? 5 most significant bits must not all be zero |
| 1057 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) | 1042 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) |
| 1058 // illegal surrogate codepoint? | 1043 // illegal surrogate codepoint? |
| 1059 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) | 1044 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) |
| 1060 // byte3 trailing-byte test | 1045 // byte3 trailing-byte test |
| 1061 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { | 1046 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { |
| 1062 return MALFORMED; | 1047 return MALFORMED; |
| 1063 } | 1048 } |
| 1064 } else { | 1049 } else { |
| 1065 // four-byte form | 1050 // four-byte form |
| 1066 | 1051 |
| 1067 // Get byte2 and byte3 from saved state or array | 1052 // Get byte2 and byte3 from saved state or array |
| 1068 int byte2 = (byte) ~(state >> 8); | 1053 int byte2 = (byte) ~(state >> 8); |
| 1069 int byte3 = 0; | 1054 int byte3 = 0; |
| 1070 if (byte2 == 0) { | 1055 if (byte2 == 0) { |
| 1071 byte2 = UNSAFE.getByte(bytes, offset++); | 1056 byte2 = UnsafeUtil.getByte(bytes, offset++); |
| 1072 if (offset >= offsetLimit) { | 1057 if (offset >= offsetLimit) { |
| 1073 return incompleteStateFor(byte1, byte2); | 1058 return incompleteStateFor(byte1, byte2); |
| 1074 } | 1059 } |
| 1075 } else { | 1060 } else { |
| 1076 byte3 = (byte) (state >> 16); | 1061 byte3 = (byte) (state >> 16); |
| 1077 } | 1062 } |
| 1078 if (byte3 == 0) { | 1063 if (byte3 == 0) { |
| 1079 byte3 = UNSAFE.getByte(bytes, offset++); | 1064 byte3 = UnsafeUtil.getByte(bytes, offset++); |
| 1080 if (offset >= offsetLimit) { | 1065 if (offset >= offsetLimit) { |
| 1081 return incompleteStateFor(byte1, byte2, byte3); | 1066 return incompleteStateFor(byte1, byte2, byte3); |
| 1082 } | 1067 } |
| 1083 } | 1068 } |
| 1084 | 1069 |
| 1085 // If we were called with state == MALFORMED, then byte1 is 0xFF, | 1070 // If we were called with state == MALFORMED, then byte1 is 0xFF, |
| 1086 // which never occurs in well-formed UTF-8, and so we will return | 1071 // which never occurs in well-formed UTF-8, and so we will return |
| 1087 // MALFORMED again below. | 1072 // MALFORMED again below. |
| 1088 | 1073 |
| 1089 if (byte2 > (byte) 0xBF | 1074 if (byte2 > (byte) 0xBF |
| 1090 // Check that 1 <= plane <= 16. Tricky optimized form of: | 1075 // Check that 1 <= plane <= 16. Tricky optimized form of: |
| 1091 // if (byte1 > (byte) 0xF4 || | 1076 // if (byte1 > (byte) 0xF4 || |
| 1092 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || | 1077 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || |
| 1093 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) | 1078 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) |
| 1094 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 | 1079 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 |
| 1095 // byte3 trailing-byte test | 1080 // byte3 trailing-byte test |
| 1096 || byte3 > (byte) 0xBF | 1081 || byte3 > (byte) 0xBF |
| 1097 // byte4 trailing-byte test | 1082 // byte4 trailing-byte test |
| 1098 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { | 1083 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { |
| 1099 return MALFORMED; | 1084 return MALFORMED; |
| 1100 } | 1085 } |
| 1101 } | 1086 } |
| 1102 } | 1087 } |
| 1103 | 1088 |
| 1104 return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset)); | 1089 return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset)); |
| 1105 } | 1090 } |
| 1106 | 1091 |
| 1107 @Override | 1092 @Override |
| 1108 int partialIsValidUtf8Direct( | 1093 int partialIsValidUtf8Direct( |
| (...skipping 18 matching lines...) Expand all Loading... |
| 1127 | 1112 |
| 1128 final int byte1 = (byte) state; | 1113 final int byte1 = (byte) state; |
| 1129 // byte1 is never ASCII. | 1114 // byte1 is never ASCII. |
| 1130 if (byte1 < (byte) 0xE0) { | 1115 if (byte1 < (byte) 0xE0) { |
| 1131 // two-byte form | 1116 // two-byte form |
| 1132 | 1117 |
| 1133 // Simultaneously checks for illegal trailing-byte in | 1118 // Simultaneously checks for illegal trailing-byte in |
| 1134 // leading position and overlong 2-byte form. | 1119 // leading position and overlong 2-byte form. |
| 1135 if (byte1 < (byte) 0xC2 | 1120 if (byte1 < (byte) 0xC2 |
| 1136 // byte2 trailing-byte test | 1121 // byte2 trailing-byte test |
| 1137 || UNSAFE.getByte(address++) > (byte) 0xBF) { | 1122 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { |
| 1138 return MALFORMED; | 1123 return MALFORMED; |
| 1139 } | 1124 } |
| 1140 } else if (byte1 < (byte) 0xF0) { | 1125 } else if (byte1 < (byte) 0xF0) { |
| 1141 // three-byte form | 1126 // three-byte form |
| 1142 | 1127 |
| 1143 // Get byte2 from saved state or array | 1128 // Get byte2 from saved state or array |
| 1144 int byte2 = (byte) ~(state >> 8); | 1129 int byte2 = (byte) ~(state >> 8); |
| 1145 if (byte2 == 0) { | 1130 if (byte2 == 0) { |
| 1146 byte2 = UNSAFE.getByte(address++); | 1131 byte2 = UnsafeUtil.getByte(address++); |
| 1147 if (address >= addressLimit) { | 1132 if (address >= addressLimit) { |
| 1148 return incompleteStateFor(byte1, byte2); | 1133 return incompleteStateFor(byte1, byte2); |
| 1149 } | 1134 } |
| 1150 } | 1135 } |
| 1151 if (byte2 > (byte) 0xBF | 1136 if (byte2 > (byte) 0xBF |
| 1152 // overlong? 5 most significant bits must not all be zero | 1137 // overlong? 5 most significant bits must not all be zero |
| 1153 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) | 1138 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) |
| 1154 // illegal surrogate codepoint? | 1139 // illegal surrogate codepoint? |
| 1155 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) | 1140 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) |
| 1156 // byte3 trailing-byte test | 1141 // byte3 trailing-byte test |
| 1157 || UNSAFE.getByte(address++) > (byte) 0xBF) { | 1142 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { |
| 1158 return MALFORMED; | 1143 return MALFORMED; |
| 1159 } | 1144 } |
| 1160 } else { | 1145 } else { |
| 1161 // four-byte form | 1146 // four-byte form |
| 1162 | 1147 |
| 1163 // Get byte2 and byte3 from saved state or array | 1148 // Get byte2 and byte3 from saved state or array |
| 1164 int byte2 = (byte) ~(state >> 8); | 1149 int byte2 = (byte) ~(state >> 8); |
| 1165 int byte3 = 0; | 1150 int byte3 = 0; |
| 1166 if (byte2 == 0) { | 1151 if (byte2 == 0) { |
| 1167 byte2 = UNSAFE.getByte(address++); | 1152 byte2 = UnsafeUtil.getByte(address++); |
| 1168 if (address >= addressLimit) { | 1153 if (address >= addressLimit) { |
| 1169 return incompleteStateFor(byte1, byte2); | 1154 return incompleteStateFor(byte1, byte2); |
| 1170 } | 1155 } |
| 1171 } else { | 1156 } else { |
| 1172 byte3 = (byte) (state >> 16); | 1157 byte3 = (byte) (state >> 16); |
| 1173 } | 1158 } |
| 1174 if (byte3 == 0) { | 1159 if (byte3 == 0) { |
| 1175 byte3 = UNSAFE.getByte(address++); | 1160 byte3 = UnsafeUtil.getByte(address++); |
| 1176 if (address >= addressLimit) { | 1161 if (address >= addressLimit) { |
| 1177 return incompleteStateFor(byte1, byte2, byte3); | 1162 return incompleteStateFor(byte1, byte2, byte3); |
| 1178 } | 1163 } |
| 1179 } | 1164 } |
| 1180 | 1165 |
| 1181 // If we were called with state == MALFORMED, then byte1 is 0xFF, | 1166 // If we were called with state == MALFORMED, then byte1 is 0xFF, |
| 1182 // which never occurs in well-formed UTF-8, and so we will return | 1167 // which never occurs in well-formed UTF-8, and so we will return |
| 1183 // MALFORMED again below. | 1168 // MALFORMED again below. |
| 1184 | 1169 |
| 1185 if (byte2 > (byte) 0xBF | 1170 if (byte2 > (byte) 0xBF |
| 1186 // Check that 1 <= plane <= 16. Tricky optimized form of: | 1171 // Check that 1 <= plane <= 16. Tricky optimized form of: |
| 1187 // if (byte1 > (byte) 0xF4 || | 1172 // if (byte1 > (byte) 0xF4 || |
| 1188 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || | 1173 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || |
| 1189 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) | 1174 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) |
| 1190 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 | 1175 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 |
| 1191 // byte3 trailing-byte test | 1176 // byte3 trailing-byte test |
| 1192 || byte3 > (byte) 0xBF | 1177 || byte3 > (byte) 0xBF |
| 1193 // byte4 trailing-byte test | 1178 // byte4 trailing-byte test |
| 1194 || UNSAFE.getByte(address++) > (byte) 0xBF) { | 1179 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { |
| 1195 return MALFORMED; | 1180 return MALFORMED; |
| 1196 } | 1181 } |
| 1197 } | 1182 } |
| 1198 } | 1183 } |
| 1199 | 1184 |
| 1200 return partialIsValidUtf8(address, (int) (addressLimit - address)); | 1185 return partialIsValidUtf8(address, (int) (addressLimit - address)); |
| 1201 } | 1186 } |
| 1202 | 1187 |
| 1203 @Override | 1188 @Override |
| 1204 int encodeUtf8(final CharSequence in, final byte[] out, final int offset, fi
nal int length) { | 1189 int encodeUtf8(final CharSequence in, final byte[] out, final int offset, fi
nal int length) { |
| 1205 long outIx = ARRAY_BASE_OFFSET + offset; | 1190 long outIx = getArrayBaseOffset() + offset; |
| 1206 final long outLimit = outIx + length; | 1191 final long outLimit = outIx + length; |
| 1207 final int inLimit = in.length(); | 1192 final int inLimit = in.length(); |
| 1208 if (inLimit > length || out.length - length < offset) { | 1193 if (inLimit > length || out.length - length < offset) { |
| 1209 // Not even enough room for an ASCII-encoded string. | 1194 // Not even enough room for an ASCII-encoded string. |
| 1210 throw new ArrayIndexOutOfBoundsException( | 1195 throw new ArrayIndexOutOfBoundsException( |
| 1211 "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset
+ length)); | 1196 "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset
+ length)); |
| 1212 } | 1197 } |
| 1213 | 1198 |
| 1214 // Designed to take advantage of | 1199 // Designed to take advantage of |
| 1215 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination | 1200 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination |
| 1216 int inIx = 0; | 1201 int inIx = 0; |
| 1217 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { | 1202 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { |
| 1218 UNSAFE.putByte(out, outIx++, (byte) c); | 1203 UnsafeUtil.putByte(out, outIx++, (byte) c); |
| 1219 } | 1204 } |
| 1220 if (inIx == inLimit) { | 1205 if (inIx == inLimit) { |
| 1221 // We're done, it was ASCII encoded. | 1206 // We're done, it was ASCII encoded. |
| 1222 return (int) (outIx - ARRAY_BASE_OFFSET); | 1207 return (int) (outIx - getArrayBaseOffset()); |
| 1223 } | 1208 } |
| 1224 | 1209 |
| 1225 for (char c; inIx < inLimit; ++inIx) { | 1210 for (char c; inIx < inLimit; ++inIx) { |
| 1226 c = in.charAt(inIx); | 1211 c = in.charAt(inIx); |
| 1227 if (c < 0x80 && outIx < outLimit) { | 1212 if (c < 0x80 && outIx < outLimit) { |
| 1228 UNSAFE.putByte(out, outIx++, (byte) c); | 1213 UnsafeUtil.putByte(out, outIx++, (byte) c); |
| 1229 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8
bytes | 1214 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8
bytes |
| 1230 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 6) | (c >>> 6))); | 1215 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 6) | (c >>> 6))); |
| 1231 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); | 1216 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); |
| 1232 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit
- 3L) { | 1217 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit
- 3L) { |
| 1233 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes | 1218 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes |
| 1234 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 5) | (c >>> 12))); | 1219 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 5) | (c >>> 12))); |
| 1235 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); | 1220 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); |
| 1236 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); | 1221 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); |
| 1237 } else if (outIx <= outLimit - 4L) { | 1222 } else if (outIx <= outLimit - 4L) { |
| 1238 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 | 1223 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 |
| 1239 // bytes | 1224 // bytes |
| 1240 final char low; | 1225 final char low; |
| 1241 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx
)))) { | 1226 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx
)))) { |
| 1242 throw new UnpairedSurrogateException((inIx - 1), inLimit); | 1227 throw new UnpairedSurrogateException((inIx - 1), inLimit); |
| 1243 } | 1228 } |
| 1244 int codePoint = toCodePoint(c, low); | 1229 int codePoint = toCodePoint(c, low); |
| 1245 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)))
; | 1230 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 4) | (codePoint >>> 1
8))); |
| 1246 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)
))); | 1231 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>>
12)))); |
| 1247 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))
)); | 1232 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>>
6)))); |
| 1248 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & codePoint))); | 1233 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & codePoint))); |
| 1249 } else { | 1234 } else { |
| 1250 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) | 1235 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) |
| 1251 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)
))) { | 1236 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)
))) { |
| 1252 // We are surrogates and we're not a surrogate pair. | 1237 // We are surrogates and we're not a surrogate pair. |
| 1253 throw new UnpairedSurrogateException(inIx, inLimit); | 1238 throw new UnpairedSurrogateException(inIx, inLimit); |
| 1254 } | 1239 } |
| 1255 // Not enough space in the output buffer. | 1240 // Not enough space in the output buffer. |
| 1256 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at
index " + outIx); | 1241 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at
index " + outIx); |
| 1257 } | 1242 } |
| 1258 } | 1243 } |
| 1259 | 1244 |
| 1260 // All bytes have been encoded. | 1245 // All bytes have been encoded. |
| 1261 return (int) (outIx - ARRAY_BASE_OFFSET); | 1246 return (int) (outIx - getArrayBaseOffset()); |
| 1262 } | 1247 } |
| 1263 | 1248 |
| 1264 @Override | 1249 @Override |
| 1265 void encodeUtf8Direct(CharSequence in, ByteBuffer out) { | 1250 void encodeUtf8Direct(CharSequence in, ByteBuffer out) { |
| 1266 final long address = addressOffset(out); | 1251 final long address = addressOffset(out); |
| 1267 long outIx = address + out.position(); | 1252 long outIx = address + out.position(); |
| 1268 final long outLimit = address + out.limit(); | 1253 final long outLimit = address + out.limit(); |
| 1269 final int inLimit = in.length(); | 1254 final int inLimit = in.length(); |
| 1270 if (inLimit > outLimit - outIx) { | 1255 if (inLimit > outLimit - outIx) { |
| 1271 // Not even enough room for an ASCII-encoded string. | 1256 // Not even enough room for an ASCII-encoded string. |
| 1272 throw new ArrayIndexOutOfBoundsException( | 1257 throw new ArrayIndexOutOfBoundsException( |
| 1273 "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limi
t()); | 1258 "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limi
t()); |
| 1274 } | 1259 } |
| 1275 | 1260 |
| 1276 // Designed to take advantage of | 1261 // Designed to take advantage of |
| 1277 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination | 1262 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination |
| 1278 int inIx = 0; | 1263 int inIx = 0; |
| 1279 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { | 1264 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { |
| 1280 UNSAFE.putByte(outIx++, (byte) c); | 1265 UnsafeUtil.putByte(outIx++, (byte) c); |
| 1281 } | 1266 } |
| 1282 if (inIx == inLimit) { | 1267 if (inIx == inLimit) { |
| 1283 // We're done, it was ASCII encoded. | 1268 // We're done, it was ASCII encoded. |
| 1284 out.position((int) (outIx - address)); | 1269 out.position((int) (outIx - address)); |
| 1285 return; | 1270 return; |
| 1286 } | 1271 } |
| 1287 | 1272 |
| 1288 for (char c; inIx < inLimit; ++inIx) { | 1273 for (char c; inIx < inLimit; ++inIx) { |
| 1289 c = in.charAt(inIx); | 1274 c = in.charAt(inIx); |
| 1290 if (c < 0x80 && outIx < outLimit) { | 1275 if (c < 0x80 && outIx < outLimit) { |
| 1291 UNSAFE.putByte(outIx++, (byte) c); | 1276 UnsafeUtil.putByte(outIx++, (byte) c); |
| 1292 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8
bytes | 1277 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8
bytes |
| 1293 UNSAFE.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6))); | 1278 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6))); |
| 1294 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & c))); | 1279 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c))); |
| 1295 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit
- 3L) { | 1280 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit
- 3L) { |
| 1296 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes | 1281 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes |
| 1297 UNSAFE.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12))); | 1282 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12))); |
| 1298 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); | 1283 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); |
| 1299 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & c))); | 1284 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c))); |
| 1300 } else if (outIx <= outLimit - 4L) { | 1285 } else if (outIx <= outLimit - 4L) { |
| 1301 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 | 1286 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 |
| 1302 // bytes | 1287 // bytes |
| 1303 final char low; | 1288 final char low; |
| 1304 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx
)))) { | 1289 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx
)))) { |
| 1305 throw new UnpairedSurrogateException((inIx - 1), inLimit); | 1290 throw new UnpairedSurrogateException((inIx - 1), inLimit); |
| 1306 } | 1291 } |
| 1307 int codePoint = toCodePoint(c, low); | 1292 int codePoint = toCodePoint(c, low); |
| 1308 UNSAFE.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); | 1293 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); |
| 1309 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); | 1294 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))
)); |
| 1310 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); | 1295 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))
); |
| 1311 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint))); | 1296 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint))); |
| 1312 } else { | 1297 } else { |
| 1313 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) | 1298 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) |
| 1314 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)
))) { | 1299 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)
))) { |
| 1315 // We are surrogates and we're not a surrogate pair. | 1300 // We are surrogates and we're not a surrogate pair. |
| 1316 throw new UnpairedSurrogateException(inIx, inLimit); | 1301 throw new UnpairedSurrogateException(inIx, inLimit); |
| 1317 } | 1302 } |
| 1318 // Not enough space in the output buffer. | 1303 // Not enough space in the output buffer. |
| 1319 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at
index " + outIx); | 1304 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at
index " + outIx); |
| 1320 } | 1305 } |
| 1321 } | 1306 } |
| (...skipping 20 matching lines...) Expand all Loading... |
| 1342 return 0; | 1327 return 0; |
| 1343 } | 1328 } |
| 1344 | 1329 |
| 1345 // Read bytes until 8-byte aligned so that we can read longs in the loop below. | 1330 // Read bytes until 8-byte aligned so that we can read longs in the loop below. |
| 1346 // Byte arrays are already either 8 or 16-byte aligned, so we just need to make sure that | 1331 // Byte arrays are already either 8 or 16-byte aligned, so we just need to make sure that |
| 1347 // the index (relative to the start of the array) is also 8-byte aligned. We do this by | 1332 // the index (relative to the start of the array) is also 8-byte aligned. We do this by |
| 1348 // ANDing the index with 7 to determine the number of bytes that need to be read before | 1333 // ANDing the index with 7 to determine the number of bytes that need to be read before |
| 1349 // we're 8-byte aligned. | 1334 // we're 8-byte aligned. |
| 1350 final int unaligned = (int) offset & 7; | 1335 final int unaligned = (int) offset & 7; |
| 1351 for (int j = unaligned; j > 0; j--) { | 1336 for (int j = unaligned; j > 0; j--) { |
| 1352 if (UNSAFE.getByte(bytes, offset++) < 0) { | 1337 if (UnsafeUtil.getByte(bytes, offset++) < 0) { |
| 1353 return unaligned - j; | 1338 return unaligned - j; |
| 1354 } | 1339 } |
| 1355 } | 1340 } |
| 1356 | 1341 |
| 1357 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). | 1342 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). |
| 1358 // To speed things up further, we're reading longs instead of bytes so we use a mask to | 1343 // To speed things up further, we're reading longs instead of bytes so we use a mask to |
| 1359 // determine if any byte in the current long is non-ASCII. | 1344 // determine if any byte in the current long is non-ASCII. |
| 1360 remaining -= unaligned; | 1345 remaining -= unaligned; |
| 1361 for (; remaining >= 8 && (UNSAFE.getLong(bytes, offset) & ASCII_MASK_LONG)
== 0; | 1346 for (; remaining >= 8 && (UnsafeUtil.getLong(bytes, offset) & ASCII_MASK_L
ONG) == 0; |
| 1362 offset += 8, remaining -= 8) {} | 1347 offset += 8, remaining -= 8) {} |
| 1363 return maxChars - remaining; | 1348 return maxChars - remaining; |
| 1364 } | 1349 } |
| 1365 | 1350 |
| 1366 /** | 1351 /** |
| 1367 * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} excep
t that it uses the | 1352 * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} excep
t that it uses the |
| 1368 * most efficient method available to the platform. | 1353 * most efficient method available to the platform. |
| 1369 */ | 1354 */ |
| 1370 private static int unsafeEstimateConsecutiveAscii(long address, final int ma
xChars) { | 1355 private static int unsafeEstimateConsecutiveAscii(long address, final int ma
xChars) { |
| 1371 int remaining = maxChars; | 1356 int remaining = maxChars; |
| 1372 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) { | 1357 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) { |
| 1373 // Don't bother with small strings. | 1358 // Don't bother with small strings. |
| 1374 return 0; | 1359 return 0; |
| 1375 } | 1360 } |
| 1376 | 1361 |
| 1377 // Read bytes until 8-byte aligned so that we can read longs in the loop b
elow. | 1362 // Read bytes until 8-byte aligned so that we can read longs in the loop b
elow. |
| 1378 // We do this by ANDing the address with 7 to determine the number of byte
s that need to | 1363 // We do this by ANDing the address with 7 to determine the number of byte
s that need to |
| 1379 // be read before we're 8-byte aligned. | 1364 // be read before we're 8-byte aligned. |
| 1380 final int unaligned = (int) address & 7; | 1365 final int unaligned = (int) address & 7; |
| 1381 for (int j = unaligned; j > 0; j--) { | 1366 for (int j = unaligned; j > 0; j--) { |
| 1382 if (UNSAFE.getByte(address++) < 0) { | 1367 if (UnsafeUtil.getByte(address++) < 0) { |
| 1383 return unaligned - j; | 1368 return unaligned - j; |
| 1384 } | 1369 } |
| 1385 } | 1370 } |
| 1386 | 1371 |
| 1387 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII
). | 1372 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII
). |
| 1388 // To speed things up further, we're reading longs instead of bytes so we
use a mask to | 1373 // To speed things up further, we're reading longs instead of bytes so we
use a mask to |
| 1389 // determine if any byte in the current long is non-ASCII. | 1374 // determine if any byte in the current long is non-ASCII. |
| 1390 remaining -= unaligned; | 1375 remaining -= unaligned; |
| 1391 for (; remaining >= 8 && (UNSAFE.getLong(address) & ASCII_MASK_LONG) == 0; | 1376 for (; remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) =
= 0; |
| 1392 address += 8, remaining -= 8) {} | 1377 address += 8, remaining -= 8) {} |
| 1393 return maxChars - remaining; | 1378 return maxChars - remaining; |
| 1394 } | 1379 } |
| 1395 | 1380 |
| 1396 private static int partialIsValidUtf8(final byte[] bytes, long offset, int r
emaining) { | 1381 private static int partialIsValidUtf8(final byte[] bytes, long offset, int r
emaining) { |
| 1397 // Skip past ASCII characters as quickly as possible. | 1382 // Skip past ASCII characters as quickly as possible. |
| 1398 final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remainin
g); | 1383 final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remainin
g); |
| 1399 remaining -= skipped; | 1384 remaining -= skipped; |
| 1400 offset += skipped; | 1385 offset += skipped; |
| 1401 | 1386 |
| 1402 for (;;) { | 1387 for (;;) { |
| 1403 // Optimize for interior runs of ASCII bytes. | 1388 // Optimize for interior runs of ASCII bytes. |
| 1404 // TODO(nathanmittler): Consider checking 8 bytes at a time after some t
hreshold? | 1389 // TODO(nathanmittler): Consider checking 8 bytes at a time after some t
hreshold? |
| 1405 // Maybe after seeing a few in a row that are ASCII, go back to fast mod
e? | 1390 // Maybe after seeing a few in a row that are ASCII, go back to fast mod
e? |
| 1406 int byte1 = 0; | 1391 int byte1 = 0; |
| 1407 for (; remaining > 0 && (byte1 = UNSAFE.getByte(bytes, offset++)) >= 0;
--remaining) { | 1392 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >=
0; --remaining) { |
| 1408 } | 1393 } |
| 1409 if (remaining == 0) { | 1394 if (remaining == 0) { |
| 1410 return COMPLETE; | 1395 return COMPLETE; |
| 1411 } | 1396 } |
| 1412 remaining--; | 1397 remaining--; |
| 1413 | 1398 |
| 1414 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms. | 1399 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms. |
| 1415 if (byte1 < (byte) 0xE0) { | 1400 if (byte1 < (byte) 0xE0) { |
| 1416 // Two-byte form (110xxxxx 10xxxxxx) | 1401 // Two-byte form (110xxxxx 10xxxxxx) |
| 1417 if (remaining == 0) { | 1402 if (remaining == 0) { |
| 1418 // Incomplete sequence | 1403 // Incomplete sequence |
| 1419 return byte1; | 1404 return byte1; |
| 1420 } | 1405 } |
| 1421 remaining--; | 1406 remaining--; |
| 1422 | 1407 |
| 1423 // Simultaneously checks for illegal trailing-byte in | 1408 // Simultaneously checks for illegal trailing-byte in |
| 1424 // leading position and overlong 2-byte form. | 1409 // leading position and overlong 2-byte form. |
| 1425 if (byte1 < (byte) 0xC2 | 1410 if (byte1 < (byte) 0xC2 |
| 1426 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { | 1411 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { |
| 1427 return MALFORMED; | 1412 return MALFORMED; |
| 1428 } | 1413 } |
| 1429 } else if (byte1 < (byte) 0xF0) { | 1414 } else if (byte1 < (byte) 0xF0) { |
| 1430 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx) | 1415 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx) |
| 1431 if (remaining < 2) { | 1416 if (remaining < 2) { |
| 1432 // Incomplete sequence | 1417 // Incomplete sequence |
| 1433 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); | 1418 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); |
| 1434 } | 1419 } |
| 1435 remaining -= 2; | 1420 remaining -= 2; |
| 1436 | 1421 |
| 1437 final int byte2; | 1422 final int byte2; |
| 1438 if ((byte2 = UNSAFE.getByte(bytes, offset++)) > (byte) 0xBF | 1423 if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF |
| 1439 // overlong? 5 most significant bits must not all be zero | 1424 // overlong? 5 most significant bits must not all be zero |
| 1440 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) | 1425 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) |
| 1441 // check for illegal surrogate codepoints | 1426 // check for illegal surrogate codepoints |
| 1442 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) | 1427 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) |
| 1443 // byte3 trailing-byte test | 1428 // byte3 trailing-byte test |
| 1444 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { | 1429 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { |
| 1445 return MALFORMED; | 1430 return MALFORMED; |
| 1446 } | 1431 } |
| 1447 } else { | 1432 } else { |
| 1448 // Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) | 1433 // Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) |
| 1449 if (remaining < 3) { | 1434 if (remaining < 3) { |
| 1450 // Incomplete sequence | 1435 // Incomplete sequence |
| 1451 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); | 1436 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); |
| 1452 } | 1437 } |
| 1453 remaining -= 3; | 1438 remaining -= 3; |
| 1454 | 1439 |
| 1455 final int byte2; | 1440 final int byte2; |
| 1456 if ((byte2 = UNSAFE.getByte(bytes, offset++)) > (byte) 0xBF | 1441 if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF |
| 1457 // Check that 1 <= plane <= 16. Tricky optimized form of: | 1442 // Check that 1 <= plane <= 16. Tricky optimized form of: |
| 1458 // if (byte1 > (byte) 0xF4 || | 1443 // if (byte1 > (byte) 0xF4 || |
| 1459 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || | 1444 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || |
| 1460 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) | 1445 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) |
| 1461 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 | 1446 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 |
| 1462 // byte3 trailing-byte test | 1447 // byte3 trailing-byte test |
| 1463 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF | 1448 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF |
| 1464 // byte4 trailing-byte test | 1449 // byte4 trailing-byte test |
| 1465 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { | 1450 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { |
| 1466 return MALFORMED; | 1451 return MALFORMED; |
| 1467 } | 1452 } |
| 1468 } | 1453 } |
| 1469 } | 1454 } |
| 1470 } | 1455 } |
| 1471 | 1456 |
| 1472 private static int partialIsValidUtf8(long address, int remaining) { | 1457 private static int partialIsValidUtf8(long address, int remaining) { |
| 1473 // Skip past ASCII characters as quickly as possible. | 1458 // Skip past ASCII characters as quickly as possible. |
| 1474 final int skipped = unsafeEstimateConsecutiveAscii(address, remaining); | 1459 final int skipped = unsafeEstimateConsecutiveAscii(address, remaining); |
| 1475 address += skipped; | 1460 address += skipped; |
| 1476 remaining -= skipped; | 1461 remaining -= skipped; |
| 1477 | 1462 |
| 1478 for (;;) { | 1463 for (;;) { |
| 1479 // Optimize for interior runs of ASCII bytes. | 1464 // Optimize for interior runs of ASCII bytes. |
| 1480 // TODO(nathanmittler): Consider checking 8 bytes at a time after some t
hreshold? | 1465 // TODO(nathanmittler): Consider checking 8 bytes at a time after some t
hreshold? |
| 1481 // Maybe after seeing a few in a row that are ASCII, go back to fast mod
e? | 1466 // Maybe after seeing a few in a row that are ASCII, go back to fast mod
e? |
| 1482 int byte1 = 0; | 1467 int byte1 = 0; |
| 1483 for (; remaining > 0 && (byte1 = UNSAFE.getByte(address++)) >= 0; --rema
ining) { | 1468 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; --
remaining) { |
| 1484 } | 1469 } |
| 1485 if (remaining == 0) { | 1470 if (remaining == 0) { |
| 1486 return COMPLETE; | 1471 return COMPLETE; |
| 1487 } | 1472 } |
| 1488 remaining--; | 1473 remaining--; |
| 1489 | 1474 |
| 1490 if (byte1 < (byte) 0xE0) { | 1475 if (byte1 < (byte) 0xE0) { |
| 1491 // Two-byte form | 1476 // Two-byte form |
| 1492 | 1477 |
| 1493 if (remaining == 0) { | 1478 if (remaining == 0) { |
| 1494 // Incomplete sequence | 1479 // Incomplete sequence |
| 1495 return byte1; | 1480 return byte1; |
| 1496 } | 1481 } |
| 1497 remaining--; | 1482 remaining--; |
| 1498 | 1483 |
| 1499 // Simultaneously checks for illegal trailing-byte in | 1484 // Simultaneously checks for illegal trailing-byte in |
| 1500 // leading position and overlong 2-byte form. | 1485 // leading position and overlong 2-byte form. |
| 1501 if (byte1 < (byte) 0xC2 || UNSAFE.getByte(address++) > (byte) 0xBF) { | 1486 if (byte1 < (byte) 0xC2 || UnsafeUtil.getByte(address++) > (byte) 0xBF
) { |
| 1502 return MALFORMED; | 1487 return MALFORMED; |
| 1503 } | 1488 } |
| 1504 } else if (byte1 < (byte) 0xF0) { | 1489 } else if (byte1 < (byte) 0xF0) { |
| 1505 // Three-byte form | 1490 // Three-byte form |
| 1506 | 1491 |
| 1507 if (remaining < 2) { | 1492 if (remaining < 2) { |
| 1508 // Incomplete sequence | 1493 // Incomplete sequence |
| 1509 return unsafeIncompleteStateFor(address, byte1, remaining); | 1494 return unsafeIncompleteStateFor(address, byte1, remaining); |
| 1510 } | 1495 } |
| 1511 remaining -= 2; | 1496 remaining -= 2; |
| 1512 | 1497 |
| 1513 final byte byte2 = UNSAFE.getByte(address++); | 1498 final byte byte2 = UnsafeUtil.getByte(address++); |
| 1514 if (byte2 > (byte) 0xBF | 1499 if (byte2 > (byte) 0xBF |
| 1515 // overlong? 5 most significant bits must not all be zero | 1500 // overlong? 5 most significant bits must not all be zero |
| 1516 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) | 1501 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) |
| 1517 // check for illegal surrogate codepoints | 1502 // check for illegal surrogate codepoints |
| 1518 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) | 1503 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) |
| 1519 // byte3 trailing-byte test | 1504 // byte3 trailing-byte test |
| 1520 || UNSAFE.getByte(address++) > (byte) 0xBF) { | 1505 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { |
| 1521 return MALFORMED; | 1506 return MALFORMED; |
| 1522 } | 1507 } |
| 1523 } else { | 1508 } else { |
| 1524 // Four-byte form | 1509 // Four-byte form |
| 1525 | 1510 |
| 1526 if (remaining < 3) { | 1511 if (remaining < 3) { |
| 1527 // Incomplete sequence | 1512 // Incomplete sequence |
| 1528 return unsafeIncompleteStateFor(address, byte1, remaining); | 1513 return unsafeIncompleteStateFor(address, byte1, remaining); |
| 1529 } | 1514 } |
| 1530 remaining -= 3; | 1515 remaining -= 3; |
| 1531 | 1516 |
| 1532 final byte byte2 = UNSAFE.getByte(address++); | 1517 final byte byte2 = UnsafeUtil.getByte(address++); |
| 1533 if (byte2 > (byte) 0xBF | 1518 if (byte2 > (byte) 0xBF |
| 1534 // Check that 1 <= plane <= 16. Tricky optimized form of: | 1519 // Check that 1 <= plane <= 16. Tricky optimized form of: |
| 1535 // if (byte1 > (byte) 0xF4 || | 1520 // if (byte1 > (byte) 0xF4 || |
| 1536 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || | 1521 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || |
| 1537 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) | 1522 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) |
| 1538 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 | 1523 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 |
| 1539 // byte3 trailing-byte test | 1524 // byte3 trailing-byte test |
| 1540 || UNSAFE.getByte(address++) > (byte) 0xBF | 1525 || UnsafeUtil.getByte(address++) > (byte) 0xBF |
| 1541 // byte4 trailing-byte test | 1526 // byte4 trailing-byte test |
| 1542 || UNSAFE.getByte(address++) > (byte) 0xBF) { | 1527 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { |
| 1543 return MALFORMED; | 1528 return MALFORMED; |
| 1544 } | 1529 } |
| 1545 } | 1530 } |
| 1546 } | 1531 } |
| 1547 } | 1532 } |
| 1548 | 1533 |
| 1549 private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long of
fset, | 1534 private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long of
fset, |
| 1550 int remaining) { | 1535 int remaining) { |
| 1551 switch (remaining) { | 1536 switch (remaining) { |
| 1552 case 0: { | 1537 case 0: { |
| 1553 return incompleteStateFor(byte1); | 1538 return incompleteStateFor(byte1); |
| 1554 } | 1539 } |
| 1555 case 1: { | 1540 case 1: { |
| 1556 return incompleteStateFor(byte1, UNSAFE.getByte(bytes, offset)); | 1541 return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset)); |
| 1557 } | 1542 } |
| 1558 case 2: { | 1543 case 2: { |
| 1559 return incompleteStateFor(byte1, UNSAFE.getByte(bytes, offset), | 1544 return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset), |
| 1560 UNSAFE.getByte(bytes, offset + 1)); | 1545 UnsafeUtil.getByte(bytes, offset + 1)); |
| 1561 } | 1546 } |
| 1562 default: { | 1547 default: { |
| 1563 throw new AssertionError(); | 1548 throw new AssertionError(); |
| 1564 } | 1549 } |
| 1565 } | 1550 } |
| 1566 } | 1551 } |
| 1567 | 1552 |
| 1568 private static int unsafeIncompleteStateFor(long address, final int byte1, i
nt remaining) { | 1553 private static int unsafeIncompleteStateFor(long address, final int byte1, i
nt remaining) { |
| 1569 switch (remaining) { | 1554 switch (remaining) { |
| 1570 case 0: { | 1555 case 0: { |
| 1571 return incompleteStateFor(byte1); | 1556 return incompleteStateFor(byte1); |
| 1572 } | 1557 } |
| 1573 case 1: { | 1558 case 1: { |
| 1574 return incompleteStateFor(byte1, UNSAFE.getByte(address)); | 1559 return incompleteStateFor(byte1, UnsafeUtil.getByte(address)); |
| 1575 } | 1560 } |
| 1576 case 2: { | 1561 case 2: { |
| 1577 return incompleteStateFor(byte1, UNSAFE.getByte(address), UNSAFE.getBy
te(address + 1)); | 1562 return incompleteStateFor(byte1, UnsafeUtil.getByte(address), |
| 1563 UnsafeUtil.getByte(address + 1)); |
| 1578 } | 1564 } |
| 1579 default: { | 1565 default: { |
| 1580 throw new AssertionError(); | 1566 throw new AssertionError(); |
| 1581 } | 1567 } |
| 1582 } | 1568 } |
| 1583 } | 1569 } |
| 1584 | |
| 1585 /** | |
| 1586 * Gets the field with the given name within the class, or {@code null} if n
ot found. If | |
| 1587 * found, the field is made accessible. | |
| 1588 */ | |
| 1589 private static Field field(Class<?> clazz, String fieldName) { | |
| 1590 Field field; | |
| 1591 try { | |
| 1592 field = clazz.getDeclaredField(fieldName); | |
| 1593 field.setAccessible(true); | |
| 1594 } catch (Throwable t) { | |
| 1595 // Failed to access the fields. | |
| 1596 field = null; | |
| 1597 } | |
| 1598 logger.log(Level.FINEST, "{0}.{1}: {2}", | |
| 1599 new Object[] {clazz.getName(), fieldName, (field != null ? "available"
: "unavailable")}); | |
| 1600 return field; | |
| 1601 } | |
| 1602 | |
| 1603 /** | |
| 1604 * Returns the offset of the provided field, or {@code -1} if {@code sun.mis
c.Unsafe} is not | |
| 1605 * available. | |
| 1606 */ | |
| 1607 private static long fieldOffset(Field field) { | |
| 1608 return field == null || UNSAFE == null ? -1 : UNSAFE.objectFieldOffset(fie
ld); | |
| 1609 } | |
| 1610 | |
| 1611 /** | |
| 1612 * Get the base offset for byte arrays, or {@code -1} if {@code sun.misc.Uns
afe} is not | |
| 1613 * available. | |
| 1614 */ | |
| 1615 private static <T> int byteArrayBaseOffset() { | |
| 1616 return UNSAFE == null ? -1 : UNSAFE.arrayBaseOffset(byte[].class); | |
| 1617 } | |
| 1618 | |
| 1619 /** | |
| 1620 * Gets the offset of the {@code address} field of the given direct {@link B
yteBuffer}. | |
| 1621 */ | |
| 1622 private static long addressOffset(ByteBuffer buffer) { | |
| 1623 return UNSAFE.getLong(buffer, BUFFER_ADDRESS_OFFSET); | |
| 1624 } | |
| 1625 | |
| 1626 /** | |
| 1627 * Gets the {@code sun.misc.Unsafe} instance, or {@code null} if not availab
le on this | |
| 1628 * platform. | |
| 1629 */ | |
| 1630 private static sun.misc.Unsafe getUnsafe() { | |
| 1631 sun.misc.Unsafe unsafe = null; | |
| 1632 try { | |
| 1633 unsafe = AccessController.doPrivileged(new PrivilegedExceptionAction<sun
.misc.Unsafe>() { | |
| 1634 @Override | |
| 1635 public sun.misc.Unsafe run() throws Exception { | |
| 1636 Class<sun.misc.Unsafe> k = sun.misc.Unsafe.class; | |
| 1637 | |
| 1638 // Check that this platform supports all of the required unsafe meth
ods. | |
| 1639 checkRequiredMethods(k); | |
| 1640 | |
| 1641 for (Field f : k.getDeclaredFields()) { | |
| 1642 f.setAccessible(true); | |
| 1643 Object x = f.get(null); | |
| 1644 if (k.isInstance(x)) { | |
| 1645 return k.cast(x); | |
| 1646 } | |
| 1647 } | |
| 1648 // The sun.misc.Unsafe field does not exist. | |
| 1649 return null; | |
| 1650 } | |
| 1651 }); | |
| 1652 } catch (Throwable e) { | |
| 1653 // Catching Throwable here due to the fact that Google AppEngine raises
NoClassDefFoundError | |
| 1654 // for Unsafe. | |
| 1655 } | |
| 1656 | |
| 1657 logger.log(Level.FINEST, "sun.misc.Unsafe: {}", | |
| 1658 unsafe != null ? "available" : "unavailable"); | |
| 1659 return unsafe; | |
| 1660 } | |
| 1661 | |
| 1662 /** | |
| 1663 * Verifies that all required methods of {@code sun.misc.Unsafe} are availab
le on this platform. | |
| 1664 */ | |
| 1665 private static void checkRequiredMethods(Class<sun.misc.Unsafe> clazz) | |
| 1666 throws NoSuchMethodException, SecurityException { | |
| 1667 // Needed for Unsafe byte[] access | |
| 1668 clazz.getMethod("arrayBaseOffset", Class.class); | |
| 1669 clazz.getMethod("getByte", Object.class, long.class); | |
| 1670 clazz.getMethod("putByte", Object.class, long.class, byte.class); | |
| 1671 clazz.getMethod("getLong", Object.class, long.class); | |
| 1672 | |
| 1673 // Needed for Unsafe Direct ByteBuffer access | |
| 1674 clazz.getMethod("objectFieldOffset", Field.class); | |
| 1675 clazz.getMethod("getByte", long.class); | |
| 1676 clazz.getMethod("getLong", Object.class, long.class); | |
| 1677 clazz.getMethod("putByte", long.class, byte.class); | |
| 1678 clazz.getMethod("getLong", long.class); | |
| 1679 } | |
| 1680 } | 1570 } |
| 1681 | 1571 |
| 1682 private Utf8() {} | 1572 private Utf8() {} |
| 1683 } | 1573 } |
| OLD | NEW |