OLD | NEW |
1 // Protocol Buffers - Google's data interchange format | 1 // Protocol Buffers - Google's data interchange format |
2 // Copyright 2008 Google Inc. All rights reserved. | 2 // Copyright 2008 Google Inc. All rights reserved. |
3 // https://developers.google.com/protocol-buffers/ | 3 // https://developers.google.com/protocol-buffers/ |
4 // | 4 // |
5 // Redistribution and use in source and binary forms, with or without | 5 // Redistribution and use in source and binary forms, with or without |
6 // modification, are permitted provided that the following conditions are | 6 // modification, are permitted provided that the following conditions are |
7 // met: | 7 // met: |
8 // | 8 // |
9 // * Redistributions of source code must retain the above copyright | 9 // * Redistributions of source code must retain the above copyright |
10 // notice, this list of conditions and the following disclaimer. | 10 // notice, this list of conditions and the following disclaimer. |
(...skipping 12 matching lines...) Expand all Loading... |
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | 23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | 24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | 25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | 26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
30 | 30 |
31 package com.google.protobuf; | 31 package com.google.protobuf; |
32 | 32 |
| 33 import static com.google.protobuf.UnsafeUtil.addressOffset; |
| 34 import static com.google.protobuf.UnsafeUtil.getArrayBaseOffset; |
| 35 import static com.google.protobuf.UnsafeUtil.hasUnsafeArrayOperations; |
| 36 import static com.google.protobuf.UnsafeUtil.hasUnsafeByteBufferOperations; |
33 import static java.lang.Character.MAX_SURROGATE; | 37 import static java.lang.Character.MAX_SURROGATE; |
34 import static java.lang.Character.MIN_SURROGATE; | 38 import static java.lang.Character.MIN_SURROGATE; |
35 import static java.lang.Character.isSurrogatePair; | 39 import static java.lang.Character.isSurrogatePair; |
36 import static java.lang.Character.toCodePoint; | 40 import static java.lang.Character.toCodePoint; |
37 | 41 |
38 import java.lang.reflect.Field; | |
39 import java.nio.Buffer; | |
40 import java.nio.ByteBuffer; | 42 import java.nio.ByteBuffer; |
41 import java.security.AccessController; | |
42 import java.security.PrivilegedExceptionAction; | |
43 import java.util.logging.Level; | |
44 import java.util.logging.Logger; | |
45 | 43 |
46 /** | 44 /** |
47 * A set of low-level, high-performance static utility methods related | 45 * A set of low-level, high-performance static utility methods related |
48 * to the UTF-8 character encoding. This class has no dependencies | 46 * to the UTF-8 character encoding. This class has no dependencies |
49 * outside of the core JDK libraries. | 47 * outside of the core JDK libraries. |
50 * | 48 * |
51 * <p>There are several variants of UTF-8. The one implemented by | 49 * <p>There are several variants of UTF-8. The one implemented by |
52 * this class is the restricted definition of UTF-8 introduced in | 50 * this class is the restricted definition of UTF-8 introduced in |
53 * Unicode 3.1, which mandates the rejection of "overlong" byte | 51 * Unicode 3.1, which mandates the rejection of "overlong" byte |
54 * sequences as well as rejection of 3-byte surrogate codepoint byte | 52 * sequences as well as rejection of 3-byte surrogate codepoint byte |
(...skipping 17 matching lines...) Expand all Loading... |
72 * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is | 70 * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is |
73 * well-formed in the absence of additional input, or if the byte sequence | 71 * well-formed in the absence of additional input, or if the byte sequence |
74 * apparently terminated in the middle of a character, an opaque integer | 72 * apparently terminated in the middle of a character, an opaque integer |
75 * "state" value containing enough information to decode the character when | 73 * "state" value containing enough information to decode the character when |
76 * passed to a subsequent invocation of a partial decoding method. | 74 * passed to a subsequent invocation of a partial decoding method. |
77 * | 75 * |
78 * @author martinrb@google.com (Martin Buchholz) | 76 * @author martinrb@google.com (Martin Buchholz) |
79 */ | 77 */ |
80 // TODO(nathanmittler): Copy changes in this class back to Guava | 78 // TODO(nathanmittler): Copy changes in this class back to Guava |
81 final class Utf8 { | 79 final class Utf8 { |
82 private static final Logger logger = Logger.getLogger(Utf8.class.getName()); | |
83 | 80 |
84 /** | 81 /** |
85 * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized impl
ementations | 82 * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized impl
ementations |
86 * depending on what is available on the platform. The processor is the platfo
rm-optimized | 83 * depending on what is available on the platform. The processor is the platfo
rm-optimized |
87 * delegate for which all methods are delegated directly to. | 84 * delegate for which all methods are delegated directly to. |
88 */ | 85 */ |
89 private static final Processor processor = | 86 private static final Processor processor = |
90 UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor(
); | 87 UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor(
); |
91 | 88 |
92 /** | 89 /** |
(...skipping 137 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
230 default: | 227 default: |
231 throw new AssertionError(); | 228 throw new AssertionError(); |
232 } | 229 } |
233 } | 230 } |
234 | 231 |
235 // These UTF-8 handling methods are copied from Guava's Utf8 class with a modi
fication to throw | 232 // These UTF-8 handling methods are copied from Guava's Utf8 class with a modi
fication to throw |
236 // a protocol buffer local exception. This exception is then caught in CodedOu
tputStream so it can | 233 // a protocol buffer local exception. This exception is then caught in CodedOu
tputStream so it can |
237 // fallback to more lenient behavior. | 234 // fallback to more lenient behavior. |
238 | 235 |
239 static class UnpairedSurrogateException extends IllegalArgumentException { | 236 static class UnpairedSurrogateException extends IllegalArgumentException { |
240 private UnpairedSurrogateException(int index, int length) { | 237 UnpairedSurrogateException(int index, int length) { |
241 super("Unpaired surrogate at index " + index + " of " + length); | 238 super("Unpaired surrogate at index " + index + " of " + length); |
242 } | 239 } |
243 } | 240 } |
244 | 241 |
245 /** | 242 /** |
246 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}.
For a string, | 243 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}.
For a string, |
247 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is
more efficient in | 244 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is
more efficient in |
248 * both time and space. | 245 * both time and space. |
249 * | 246 * |
250 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UT
F-16 (unpaired | 247 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UT
F-16 (unpaired |
(...skipping 733 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
984 } | 981 } |
985 } | 982 } |
986 } | 983 } |
987 } | 984 } |
988 } | 985 } |
989 | 986 |
990 /** | 987 /** |
991 * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to impro
ve performance. | 988 * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to impro
ve performance. |
992 */ | 989 */ |
993 static final class UnsafeProcessor extends Processor { | 990 static final class UnsafeProcessor extends Processor { |
994 private static final sun.misc.Unsafe UNSAFE = getUnsafe(); | |
995 private static final long BUFFER_ADDRESS_OFFSET = | |
996 fieldOffset(field(Buffer.class, "address")); | |
997 private static final int ARRAY_BASE_OFFSET = byteArrayBaseOffset(); | |
998 | |
999 /** | |
1000 * We only use Unsafe operations if we have access to direct {@link ByteBuff
er}'s address | |
1001 * and the array base offset is a multiple of 8 (needed by Unsafe.getLong())
. | |
1002 */ | |
1003 private static final boolean AVAILABLE = | |
1004 BUFFER_ADDRESS_OFFSET != -1 && ARRAY_BASE_OFFSET % 8 == 0; | |
1005 | |
1006 /** | 991 /** |
1007 * Indicates whether or not all required unsafe operations are supported on
this platform. | 992 * Indicates whether or not all required unsafe operations are supported on
this platform. |
1008 */ | 993 */ |
1009 static boolean isAvailable() { | 994 static boolean isAvailable() { |
1010 return AVAILABLE; | 995 return hasUnsafeArrayOperations() && hasUnsafeByteBufferOperations(); |
1011 } | 996 } |
1012 | 997 |
1013 @Override | 998 @Override |
1014 int partialIsValidUtf8(int state, byte[] bytes, final int index, final int l
imit) { | 999 int partialIsValidUtf8(int state, byte[] bytes, final int index, final int l
imit) { |
1015 if ((index | limit | bytes.length - limit) < 0) { | 1000 if ((index | limit | bytes.length - limit) < 0) { |
1016 throw new ArrayIndexOutOfBoundsException( | 1001 throw new ArrayIndexOutOfBoundsException( |
1017 String.format("Array length=%d, index=%d, limit=%d", bytes.length, i
ndex, limit)); | 1002 String.format("Array length=%d, index=%d, limit=%d", bytes.length, i
ndex, limit)); |
1018 } | 1003 } |
1019 long offset = ARRAY_BASE_OFFSET + index; | 1004 long offset = getArrayBaseOffset() + index; |
1020 final long offsetLimit = ARRAY_BASE_OFFSET + limit; | 1005 final long offsetLimit = getArrayBaseOffset() + limit; |
1021 if (state != COMPLETE) { | 1006 if (state != COMPLETE) { |
1022 // The previous decoding operation was incomplete (or malformed). | 1007 // The previous decoding operation was incomplete (or malformed). |
1023 // We look for a well-formed sequence consisting of bytes from | 1008 // We look for a well-formed sequence consisting of bytes from |
1024 // the previous decoding operation (stored in state) together | 1009 // the previous decoding operation (stored in state) together |
1025 // with bytes from the array slice. | 1010 // with bytes from the array slice. |
1026 // | 1011 // |
1027 // We expect such "straddler characters" to be rare. | 1012 // We expect such "straddler characters" to be rare. |
1028 | 1013 |
1029 if (offset >= offsetLimit) { // No bytes? No progress. | 1014 if (offset >= offsetLimit) { // No bytes? No progress. |
1030 return state; | 1015 return state; |
1031 } | 1016 } |
1032 int byte1 = (byte) state; | 1017 int byte1 = (byte) state; |
1033 // byte1 is never ASCII. | 1018 // byte1 is never ASCII. |
1034 if (byte1 < (byte) 0xE0) { | 1019 if (byte1 < (byte) 0xE0) { |
1035 // two-byte form | 1020 // two-byte form |
1036 | 1021 |
1037 // Simultaneously checks for illegal trailing-byte in | 1022 // Simultaneously checks for illegal trailing-byte in |
1038 // leading position and overlong 2-byte form. | 1023 // leading position and overlong 2-byte form. |
1039 if (byte1 < (byte) 0xC2 | 1024 if (byte1 < (byte) 0xC2 |
1040 // byte2 trailing-byte test | 1025 // byte2 trailing-byte test |
1041 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { | 1026 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { |
1042 return MALFORMED; | 1027 return MALFORMED; |
1043 } | 1028 } |
1044 } else if (byte1 < (byte) 0xF0) { | 1029 } else if (byte1 < (byte) 0xF0) { |
1045 // three-byte form | 1030 // three-byte form |
1046 | 1031 |
1047 // Get byte2 from saved state or array | 1032 // Get byte2 from saved state or array |
1048 int byte2 = (byte) ~(state >> 8); | 1033 int byte2 = (byte) ~(state >> 8); |
1049 if (byte2 == 0) { | 1034 if (byte2 == 0) { |
1050 byte2 = UNSAFE.getByte(bytes, offset++); | 1035 byte2 = UnsafeUtil.getByte(bytes, offset++); |
1051 if (offset >= offsetLimit) { | 1036 if (offset >= offsetLimit) { |
1052 return incompleteStateFor(byte1, byte2); | 1037 return incompleteStateFor(byte1, byte2); |
1053 } | 1038 } |
1054 } | 1039 } |
1055 if (byte2 > (byte) 0xBF | 1040 if (byte2 > (byte) 0xBF |
1056 // overlong? 5 most significant bits must not all be zero | 1041 // overlong? 5 most significant bits must not all be zero |
1057 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) | 1042 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) |
1058 // illegal surrogate codepoint? | 1043 // illegal surrogate codepoint? |
1059 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) | 1044 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) |
1060 // byte3 trailing-byte test | 1045 // byte3 trailing-byte test |
1061 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { | 1046 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { |
1062 return MALFORMED; | 1047 return MALFORMED; |
1063 } | 1048 } |
1064 } else { | 1049 } else { |
1065 // four-byte form | 1050 // four-byte form |
1066 | 1051 |
1067 // Get byte2 and byte3 from saved state or array | 1052 // Get byte2 and byte3 from saved state or array |
1068 int byte2 = (byte) ~(state >> 8); | 1053 int byte2 = (byte) ~(state >> 8); |
1069 int byte3 = 0; | 1054 int byte3 = 0; |
1070 if (byte2 == 0) { | 1055 if (byte2 == 0) { |
1071 byte2 = UNSAFE.getByte(bytes, offset++); | 1056 byte2 = UnsafeUtil.getByte(bytes, offset++); |
1072 if (offset >= offsetLimit) { | 1057 if (offset >= offsetLimit) { |
1073 return incompleteStateFor(byte1, byte2); | 1058 return incompleteStateFor(byte1, byte2); |
1074 } | 1059 } |
1075 } else { | 1060 } else { |
1076 byte3 = (byte) (state >> 16); | 1061 byte3 = (byte) (state >> 16); |
1077 } | 1062 } |
1078 if (byte3 == 0) { | 1063 if (byte3 == 0) { |
1079 byte3 = UNSAFE.getByte(bytes, offset++); | 1064 byte3 = UnsafeUtil.getByte(bytes, offset++); |
1080 if (offset >= offsetLimit) { | 1065 if (offset >= offsetLimit) { |
1081 return incompleteStateFor(byte1, byte2, byte3); | 1066 return incompleteStateFor(byte1, byte2, byte3); |
1082 } | 1067 } |
1083 } | 1068 } |
1084 | 1069 |
1085 // If we were called with state == MALFORMED, then byte1 is 0xFF, | 1070 // If we were called with state == MALFORMED, then byte1 is 0xFF, |
1086 // which never occurs in well-formed UTF-8, and so we will return | 1071 // which never occurs in well-formed UTF-8, and so we will return |
1087 // MALFORMED again below. | 1072 // MALFORMED again below. |
1088 | 1073 |
1089 if (byte2 > (byte) 0xBF | 1074 if (byte2 > (byte) 0xBF |
1090 // Check that 1 <= plane <= 16. Tricky optimized form of: | 1075 // Check that 1 <= plane <= 16. Tricky optimized form of: |
1091 // if (byte1 > (byte) 0xF4 || | 1076 // if (byte1 > (byte) 0xF4 || |
1092 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || | 1077 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || |
1093 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) | 1078 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) |
1094 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 | 1079 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 |
1095 // byte3 trailing-byte test | 1080 // byte3 trailing-byte test |
1096 || byte3 > (byte) 0xBF | 1081 || byte3 > (byte) 0xBF |
1097 // byte4 trailing-byte test | 1082 // byte4 trailing-byte test |
1098 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { | 1083 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { |
1099 return MALFORMED; | 1084 return MALFORMED; |
1100 } | 1085 } |
1101 } | 1086 } |
1102 } | 1087 } |
1103 | 1088 |
1104 return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset)); | 1089 return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset)); |
1105 } | 1090 } |
1106 | 1091 |
1107 @Override | 1092 @Override |
1108 int partialIsValidUtf8Direct( | 1093 int partialIsValidUtf8Direct( |
(...skipping 18 matching lines...) Expand all Loading... |
1127 | 1112 |
1128 final int byte1 = (byte) state; | 1113 final int byte1 = (byte) state; |
1129 // byte1 is never ASCII. | 1114 // byte1 is never ASCII. |
1130 if (byte1 < (byte) 0xE0) { | 1115 if (byte1 < (byte) 0xE0) { |
1131 // two-byte form | 1116 // two-byte form |
1132 | 1117 |
1133 // Simultaneously checks for illegal trailing-byte in | 1118 // Simultaneously checks for illegal trailing-byte in |
1134 // leading position and overlong 2-byte form. | 1119 // leading position and overlong 2-byte form. |
1135 if (byte1 < (byte) 0xC2 | 1120 if (byte1 < (byte) 0xC2 |
1136 // byte2 trailing-byte test | 1121 // byte2 trailing-byte test |
1137 || UNSAFE.getByte(address++) > (byte) 0xBF) { | 1122 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { |
1138 return MALFORMED; | 1123 return MALFORMED; |
1139 } | 1124 } |
1140 } else if (byte1 < (byte) 0xF0) { | 1125 } else if (byte1 < (byte) 0xF0) { |
1141 // three-byte form | 1126 // three-byte form |
1142 | 1127 |
1143 // Get byte2 from saved state or array | 1128 // Get byte2 from saved state or array |
1144 int byte2 = (byte) ~(state >> 8); | 1129 int byte2 = (byte) ~(state >> 8); |
1145 if (byte2 == 0) { | 1130 if (byte2 == 0) { |
1146 byte2 = UNSAFE.getByte(address++); | 1131 byte2 = UnsafeUtil.getByte(address++); |
1147 if (address >= addressLimit) { | 1132 if (address >= addressLimit) { |
1148 return incompleteStateFor(byte1, byte2); | 1133 return incompleteStateFor(byte1, byte2); |
1149 } | 1134 } |
1150 } | 1135 } |
1151 if (byte2 > (byte) 0xBF | 1136 if (byte2 > (byte) 0xBF |
1152 // overlong? 5 most significant bits must not all be zero | 1137 // overlong? 5 most significant bits must not all be zero |
1153 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) | 1138 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) |
1154 // illegal surrogate codepoint? | 1139 // illegal surrogate codepoint? |
1155 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) | 1140 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) |
1156 // byte3 trailing-byte test | 1141 // byte3 trailing-byte test |
1157 || UNSAFE.getByte(address++) > (byte) 0xBF) { | 1142 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { |
1158 return MALFORMED; | 1143 return MALFORMED; |
1159 } | 1144 } |
1160 } else { | 1145 } else { |
1161 // four-byte form | 1146 // four-byte form |
1162 | 1147 |
1163 // Get byte2 and byte3 from saved state or array | 1148 // Get byte2 and byte3 from saved state or array |
1164 int byte2 = (byte) ~(state >> 8); | 1149 int byte2 = (byte) ~(state >> 8); |
1165 int byte3 = 0; | 1150 int byte3 = 0; |
1166 if (byte2 == 0) { | 1151 if (byte2 == 0) { |
1167 byte2 = UNSAFE.getByte(address++); | 1152 byte2 = UnsafeUtil.getByte(address++); |
1168 if (address >= addressLimit) { | 1153 if (address >= addressLimit) { |
1169 return incompleteStateFor(byte1, byte2); | 1154 return incompleteStateFor(byte1, byte2); |
1170 } | 1155 } |
1171 } else { | 1156 } else { |
1172 byte3 = (byte) (state >> 16); | 1157 byte3 = (byte) (state >> 16); |
1173 } | 1158 } |
1174 if (byte3 == 0) { | 1159 if (byte3 == 0) { |
1175 byte3 = UNSAFE.getByte(address++); | 1160 byte3 = UnsafeUtil.getByte(address++); |
1176 if (address >= addressLimit) { | 1161 if (address >= addressLimit) { |
1177 return incompleteStateFor(byte1, byte2, byte3); | 1162 return incompleteStateFor(byte1, byte2, byte3); |
1178 } | 1163 } |
1179 } | 1164 } |
1180 | 1165 |
1181 // If we were called with state == MALFORMED, then byte1 is 0xFF, | 1166 // If we were called with state == MALFORMED, then byte1 is 0xFF, |
1182 // which never occurs in well-formed UTF-8, and so we will return | 1167 // which never occurs in well-formed UTF-8, and so we will return |
1183 // MALFORMED again below. | 1168 // MALFORMED again below. |
1184 | 1169 |
1185 if (byte2 > (byte) 0xBF | 1170 if (byte2 > (byte) 0xBF |
1186 // Check that 1 <= plane <= 16. Tricky optimized form of: | 1171 // Check that 1 <= plane <= 16. Tricky optimized form of: |
1187 // if (byte1 > (byte) 0xF4 || | 1172 // if (byte1 > (byte) 0xF4 || |
1188 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || | 1173 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || |
1189 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) | 1174 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) |
1190 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 | 1175 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 |
1191 // byte3 trailing-byte test | 1176 // byte3 trailing-byte test |
1192 || byte3 > (byte) 0xBF | 1177 || byte3 > (byte) 0xBF |
1193 // byte4 trailing-byte test | 1178 // byte4 trailing-byte test |
1194 || UNSAFE.getByte(address++) > (byte) 0xBF) { | 1179 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { |
1195 return MALFORMED; | 1180 return MALFORMED; |
1196 } | 1181 } |
1197 } | 1182 } |
1198 } | 1183 } |
1199 | 1184 |
1200 return partialIsValidUtf8(address, (int) (addressLimit - address)); | 1185 return partialIsValidUtf8(address, (int) (addressLimit - address)); |
1201 } | 1186 } |
1202 | 1187 |
1203 @Override | 1188 @Override |
1204 int encodeUtf8(final CharSequence in, final byte[] out, final int offset, fi
nal int length) { | 1189 int encodeUtf8(final CharSequence in, final byte[] out, final int offset, fi
nal int length) { |
1205 long outIx = ARRAY_BASE_OFFSET + offset; | 1190 long outIx = getArrayBaseOffset() + offset; |
1206 final long outLimit = outIx + length; | 1191 final long outLimit = outIx + length; |
1207 final int inLimit = in.length(); | 1192 final int inLimit = in.length(); |
1208 if (inLimit > length || out.length - length < offset) { | 1193 if (inLimit > length || out.length - length < offset) { |
1209 // Not even enough room for an ASCII-encoded string. | 1194 // Not even enough room for an ASCII-encoded string. |
1210 throw new ArrayIndexOutOfBoundsException( | 1195 throw new ArrayIndexOutOfBoundsException( |
1211 "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset
+ length)); | 1196 "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset
+ length)); |
1212 } | 1197 } |
1213 | 1198 |
1214 // Designed to take advantage of | 1199 // Designed to take advantage of |
1215 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination | 1200 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination |
1216 int inIx = 0; | 1201 int inIx = 0; |
1217 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { | 1202 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { |
1218 UNSAFE.putByte(out, outIx++, (byte) c); | 1203 UnsafeUtil.putByte(out, outIx++, (byte) c); |
1219 } | 1204 } |
1220 if (inIx == inLimit) { | 1205 if (inIx == inLimit) { |
1221 // We're done, it was ASCII encoded. | 1206 // We're done, it was ASCII encoded. |
1222 return (int) (outIx - ARRAY_BASE_OFFSET); | 1207 return (int) (outIx - getArrayBaseOffset()); |
1223 } | 1208 } |
1224 | 1209 |
1225 for (char c; inIx < inLimit; ++inIx) { | 1210 for (char c; inIx < inLimit; ++inIx) { |
1226 c = in.charAt(inIx); | 1211 c = in.charAt(inIx); |
1227 if (c < 0x80 && outIx < outLimit) { | 1212 if (c < 0x80 && outIx < outLimit) { |
1228 UNSAFE.putByte(out, outIx++, (byte) c); | 1213 UnsafeUtil.putByte(out, outIx++, (byte) c); |
1229 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8
bytes | 1214 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8
bytes |
1230 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 6) | (c >>> 6))); | 1215 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 6) | (c >>> 6))); |
1231 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); | 1216 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); |
1232 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit
- 3L) { | 1217 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit
- 3L) { |
1233 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 byte
s | 1218 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 byte
s |
1234 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 5) | (c >>> 12))); | 1219 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 5) | (c >>> 12))); |
1235 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); | 1220 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); |
1236 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); | 1221 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); |
1237 } else if (outIx <= outLimit - 4L) { | 1222 } else if (outIx <= outLimit - 4L) { |
1238 // Minimum code point represented by a surrogate pair is 0x10000, 17 b
its, four UTF-8 | 1223 // Minimum code point represented by a surrogate pair is 0x10000, 17 b
its, four UTF-8 |
1239 // bytes | 1224 // bytes |
1240 final char low; | 1225 final char low; |
1241 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx
)))) { | 1226 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx
)))) { |
1242 throw new UnpairedSurrogateException((inIx - 1), inLimit); | 1227 throw new UnpairedSurrogateException((inIx - 1), inLimit); |
1243 } | 1228 } |
1244 int codePoint = toCodePoint(c, low); | 1229 int codePoint = toCodePoint(c, low); |
1245 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)))
; | 1230 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 4) | (codePoint >>> 1
8))); |
1246 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)
))); | 1231 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>>
12)))); |
1247 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))
)); | 1232 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>>
6)))); |
1248 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & codePoint))); | 1233 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & codePoint))); |
1249 } else { | 1234 } else { |
1250 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) | 1235 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) |
1251 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)
))) { | 1236 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)
))) { |
1252 // We are surrogates and we're not a surrogate pair. | 1237 // We are surrogates and we're not a surrogate pair. |
1253 throw new UnpairedSurrogateException(inIx, inLimit); | 1238 throw new UnpairedSurrogateException(inIx, inLimit); |
1254 } | 1239 } |
1255 // Not enough space in the output buffer. | 1240 // Not enough space in the output buffer. |
1256 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at
index " + outIx); | 1241 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at
index " + outIx); |
1257 } | 1242 } |
1258 } | 1243 } |
1259 | 1244 |
1260 // All bytes have been encoded. | 1245 // All bytes have been encoded. |
1261 return (int) (outIx - ARRAY_BASE_OFFSET); | 1246 return (int) (outIx - getArrayBaseOffset()); |
1262 } | 1247 } |
1263 | 1248 |
1264 @Override | 1249 @Override |
1265 void encodeUtf8Direct(CharSequence in, ByteBuffer out) { | 1250 void encodeUtf8Direct(CharSequence in, ByteBuffer out) { |
1266 final long address = addressOffset(out); | 1251 final long address = addressOffset(out); |
1267 long outIx = address + out.position(); | 1252 long outIx = address + out.position(); |
1268 final long outLimit = address + out.limit(); | 1253 final long outLimit = address + out.limit(); |
1269 final int inLimit = in.length(); | 1254 final int inLimit = in.length(); |
1270 if (inLimit > outLimit - outIx) { | 1255 if (inLimit > outLimit - outIx) { |
1271 // Not even enough room for an ASCII-encoded string. | 1256 // Not even enough room for an ASCII-encoded string. |
1272 throw new ArrayIndexOutOfBoundsException( | 1257 throw new ArrayIndexOutOfBoundsException( |
1273 "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limi
t()); | 1258 "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limi
t()); |
1274 } | 1259 } |
1275 | 1260 |
1276 // Designed to take advantage of | 1261 // Designed to take advantage of |
1277 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination | 1262 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination |
1278 int inIx = 0; | 1263 int inIx = 0; |
1279 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { | 1264 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { |
1280 UNSAFE.putByte(outIx++, (byte) c); | 1265 UnsafeUtil.putByte(outIx++, (byte) c); |
1281 } | 1266 } |
1282 if (inIx == inLimit) { | 1267 if (inIx == inLimit) { |
1283 // We're done, it was ASCII encoded. | 1268 // We're done, it was ASCII encoded. |
1284 out.position((int) (outIx - address)); | 1269 out.position((int) (outIx - address)); |
1285 return; | 1270 return; |
1286 } | 1271 } |
1287 | 1272 |
1288 for (char c; inIx < inLimit; ++inIx) { | 1273 for (char c; inIx < inLimit; ++inIx) { |
1289 c = in.charAt(inIx); | 1274 c = in.charAt(inIx); |
1290 if (c < 0x80 && outIx < outLimit) { | 1275 if (c < 0x80 && outIx < outLimit) { |
1291 UNSAFE.putByte(outIx++, (byte) c); | 1276 UnsafeUtil.putByte(outIx++, (byte) c); |
1292 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8
bytes | 1277 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8
bytes |
1293 UNSAFE.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6))); | 1278 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6))); |
1294 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & c))); | 1279 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c))); |
1295 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit
- 3L) { | 1280 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit
- 3L) { |
1296 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 byte
s | 1281 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 byte
s |
1297 UNSAFE.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12))); | 1282 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12))); |
1298 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); | 1283 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); |
1299 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & c))); | 1284 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c))); |
1300 } else if (outIx <= outLimit - 4L) { | 1285 } else if (outIx <= outLimit - 4L) { |
1301 // Minimum code point represented by a surrogate pair is 0x10000, 17 b
its, four UTF-8 | 1286 // Minimum code point represented by a surrogate pair is 0x10000, 17 b
its, four UTF-8 |
1302 // bytes | 1287 // bytes |
1303 final char low; | 1288 final char low; |
1304 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx
)))) { | 1289 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx
)))) { |
1305 throw new UnpairedSurrogateException((inIx - 1), inLimit); | 1290 throw new UnpairedSurrogateException((inIx - 1), inLimit); |
1306 } | 1291 } |
1307 int codePoint = toCodePoint(c, low); | 1292 int codePoint = toCodePoint(c, low); |
1308 UNSAFE.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); | 1293 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); |
1309 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); | 1294 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))
)); |
1310 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); | 1295 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))
); |
1311 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint))); | 1296 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint))); |
1312 } else { | 1297 } else { |
1313 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) | 1298 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) |
1314 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)
))) { | 1299 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)
))) { |
1315 // We are surrogates and we're not a surrogate pair. | 1300 // We are surrogates and we're not a surrogate pair. |
1316 throw new UnpairedSurrogateException(inIx, inLimit); | 1301 throw new UnpairedSurrogateException(inIx, inLimit); |
1317 } | 1302 } |
1318 // Not enough space in the output buffer. | 1303 // Not enough space in the output buffer. |
1319 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at
index " + outIx); | 1304 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at
index " + outIx); |
1320 } | 1305 } |
1321 } | 1306 } |
(...skipping 20 matching lines...) Expand all Loading... |
1342 return 0; | 1327 return 0; |
1343 } | 1328 } |
1344 | 1329 |
1345 // Read bytes until 8-byte aligned so that we can read longs in the loop b
elow. | 1330 // Read bytes until 8-byte aligned so that we can read longs in the loop b
elow. |
1346 // Byte arrays are already either 8 or 16-byte aligned, so we just need to
make sure that | 1331 // Byte arrays are already either 8 or 16-byte aligned, so we just need to
make sure that |
1347 // the index (relative to the start of the array) is also 8-byte aligned.
We do this by | 1332 // the index (relative to the start of the array) is also 8-byte aligned.
We do this by |
1348 // ANDing the index with 7 to determine the number of bytes that need to b
e read before | 1333 // ANDing the index with 7 to determine the number of bytes that need to b
e read before |
1349 // we're 8-byte aligned. | 1334 // we're 8-byte aligned. |
1350 final int unaligned = (int) offset & 7; | 1335 final int unaligned = (int) offset & 7; |
1351 for (int j = unaligned; j > 0; j--) { | 1336 for (int j = unaligned; j > 0; j--) { |
1352 if (UNSAFE.getByte(bytes, offset++) < 0) { | 1337 if (UnsafeUtil.getByte(bytes, offset++) < 0) { |
1353 return unaligned - j; | 1338 return unaligned - j; |
1354 } | 1339 } |
1355 } | 1340 } |
1356 | 1341 |
1357 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII
). | 1342 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII
). |
1358 // To speed things up further, we're reading longs instead of bytes so we
use a mask to | 1343 // To speed things up further, we're reading longs instead of bytes so we
use a mask to |
1359 // determine if any byte in the current long is non-ASCII. | 1344 // determine if any byte in the current long is non-ASCII. |
1360 remaining -= unaligned; | 1345 remaining -= unaligned; |
1361 for (; remaining >= 8 && (UNSAFE.getLong(bytes, offset) & ASCII_MASK_LONG)
== 0; | 1346 for (; remaining >= 8 && (UnsafeUtil.getLong(bytes, offset) & ASCII_MASK_L
ONG) == 0; |
1362 offset += 8, remaining -= 8) {} | 1347 offset += 8, remaining -= 8) {} |
1363 return maxChars - remaining; | 1348 return maxChars - remaining; |
1364 } | 1349 } |
1365 | 1350 |
1366 /** | 1351 /** |
1367 * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} excep
t that it uses the | 1352 * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} excep
t that it uses the |
1368 * most efficient method available to the platform. | 1353 * most efficient method available to the platform. |
1369 */ | 1354 */ |
1370 private static int unsafeEstimateConsecutiveAscii(long address, final int ma
xChars) { | 1355 private static int unsafeEstimateConsecutiveAscii(long address, final int ma
xChars) { |
1371 int remaining = maxChars; | 1356 int remaining = maxChars; |
1372 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) { | 1357 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) { |
1373 // Don't bother with small strings. | 1358 // Don't bother with small strings. |
1374 return 0; | 1359 return 0; |
1375 } | 1360 } |
1376 | 1361 |
1377 // Read bytes until 8-byte aligned so that we can read longs in the loop b
elow. | 1362 // Read bytes until 8-byte aligned so that we can read longs in the loop b
elow. |
1378 // We do this by ANDing the address with 7 to determine the number of byte
s that need to | 1363 // We do this by ANDing the address with 7 to determine the number of byte
s that need to |
1379 // be read before we're 8-byte aligned. | 1364 // be read before we're 8-byte aligned. |
1380 final int unaligned = (int) address & 7; | 1365 final int unaligned = (int) address & 7; |
1381 for (int j = unaligned; j > 0; j--) { | 1366 for (int j = unaligned; j > 0; j--) { |
1382 if (UNSAFE.getByte(address++) < 0) { | 1367 if (UnsafeUtil.getByte(address++) < 0) { |
1383 return unaligned - j; | 1368 return unaligned - j; |
1384 } | 1369 } |
1385 } | 1370 } |
1386 | 1371 |
1387 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII
). | 1372 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII
). |
1388 // To speed things up further, we're reading longs instead of bytes so we
use a mask to | 1373 // To speed things up further, we're reading longs instead of bytes so we
use a mask to |
1389 // determine if any byte in the current long is non-ASCII. | 1374 // determine if any byte in the current long is non-ASCII. |
1390 remaining -= unaligned; | 1375 remaining -= unaligned; |
1391 for (; remaining >= 8 && (UNSAFE.getLong(address) & ASCII_MASK_LONG) == 0; | 1376 for (; remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) =
= 0; |
1392 address += 8, remaining -= 8) {} | 1377 address += 8, remaining -= 8) {} |
1393 return maxChars - remaining; | 1378 return maxChars - remaining; |
1394 } | 1379 } |
1395 | 1380 |
1396 private static int partialIsValidUtf8(final byte[] bytes, long offset, int r
emaining) { | 1381 private static int partialIsValidUtf8(final byte[] bytes, long offset, int r
emaining) { |
1397 // Skip past ASCII characters as quickly as possible. | 1382 // Skip past ASCII characters as quickly as possible. |
1398 final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remainin
g); | 1383 final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remainin
g); |
1399 remaining -= skipped; | 1384 remaining -= skipped; |
1400 offset += skipped; | 1385 offset += skipped; |
1401 | 1386 |
1402 for (;;) { | 1387 for (;;) { |
1403 // Optimize for interior runs of ASCII bytes. | 1388 // Optimize for interior runs of ASCII bytes. |
1404 // TODO(nathanmittler): Consider checking 8 bytes at a time after some t
hreshold? | 1389 // TODO(nathanmittler): Consider checking 8 bytes at a time after some t
hreshold? |
1405 // Maybe after seeing a few in a row that are ASCII, go back to fast mod
e? | 1390 // Maybe after seeing a few in a row that are ASCII, go back to fast mod
e? |
1406 int byte1 = 0; | 1391 int byte1 = 0; |
1407 for (; remaining > 0 && (byte1 = UNSAFE.getByte(bytes, offset++)) >= 0;
--remaining) { | 1392 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >=
0; --remaining) { |
1408 } | 1393 } |
1409 if (remaining == 0) { | 1394 if (remaining == 0) { |
1410 return COMPLETE; | 1395 return COMPLETE; |
1411 } | 1396 } |
1412 remaining--; | 1397 remaining--; |
1413 | 1398 |
1414 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms. | 1399 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms. |
1415 if (byte1 < (byte) 0xE0) { | 1400 if (byte1 < (byte) 0xE0) { |
1416 // Two-byte form (110xxxxx 10xxxxxx) | 1401 // Two-byte form (110xxxxx 10xxxxxx) |
1417 if (remaining == 0) { | 1402 if (remaining == 0) { |
1418 // Incomplete sequence | 1403 // Incomplete sequence |
1419 return byte1; | 1404 return byte1; |
1420 } | 1405 } |
1421 remaining--; | 1406 remaining--; |
1422 | 1407 |
1423 // Simultaneously checks for illegal trailing-byte in | 1408 // Simultaneously checks for illegal trailing-byte in |
1424 // leading position and overlong 2-byte form. | 1409 // leading position and overlong 2-byte form. |
1425 if (byte1 < (byte) 0xC2 | 1410 if (byte1 < (byte) 0xC2 |
1426 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { | 1411 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { |
1427 return MALFORMED; | 1412 return MALFORMED; |
1428 } | 1413 } |
1429 } else if (byte1 < (byte) 0xF0) { | 1414 } else if (byte1 < (byte) 0xF0) { |
1430 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx) | 1415 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx) |
1431 if (remaining < 2) { | 1416 if (remaining < 2) { |
1432 // Incomplete sequence | 1417 // Incomplete sequence |
1433 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); | 1418 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); |
1434 } | 1419 } |
1435 remaining -= 2; | 1420 remaining -= 2; |
1436 | 1421 |
1437 final int byte2; | 1422 final int byte2; |
1438 if ((byte2 = UNSAFE.getByte(bytes, offset++)) > (byte) 0xBF | 1423 if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF |
1439 // overlong? 5 most significant bits must not all be zero | 1424 // overlong? 5 most significant bits must not all be zero |
1440 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) | 1425 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) |
1441 // check for illegal surrogate codepoints | 1426 // check for illegal surrogate codepoints |
1442 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) | 1427 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) |
1443 // byte3 trailing-byte test | 1428 // byte3 trailing-byte test |
1444 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { | 1429 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { |
1445 return MALFORMED; | 1430 return MALFORMED; |
1446 } | 1431 } |
1447 } else { | 1432 } else { |
1448 // Four-byte form (1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx) | 1433 // Four-byte form (1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx) |
1449 if (remaining < 3) { | 1434 if (remaining < 3) { |
1450 // Incomplete sequence | 1435 // Incomplete sequence |
1451 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); | 1436 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); |
1452 } | 1437 } |
1453 remaining -= 3; | 1438 remaining -= 3; |
1454 | 1439 |
1455 final int byte2; | 1440 final int byte2; |
1456 if ((byte2 = UNSAFE.getByte(bytes, offset++)) > (byte) 0xBF | 1441 if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF |
1457 // Check that 1 <= plane <= 16. Tricky optimized form of: | 1442 // Check that 1 <= plane <= 16. Tricky optimized form of: |
1458 // if (byte1 > (byte) 0xF4 || | 1443 // if (byte1 > (byte) 0xF4 || |
1459 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || | 1444 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || |
1460 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) | 1445 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) |
1461 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 | 1446 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 |
1462 // byte3 trailing-byte test | 1447 // byte3 trailing-byte test |
1463 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF | 1448 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF |
1464 // byte4 trailing-byte test | 1449 // byte4 trailing-byte test |
1465 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { | 1450 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { |
1466 return MALFORMED; | 1451 return MALFORMED; |
1467 } | 1452 } |
1468 } | 1453 } |
1469 } | 1454 } |
1470 } | 1455 } |
1471 | 1456 |
1472 private static int partialIsValidUtf8(long address, int remaining) { | 1457 private static int partialIsValidUtf8(long address, int remaining) { |
1473 // Skip past ASCII characters as quickly as possible. | 1458 // Skip past ASCII characters as quickly as possible. |
1474 final int skipped = unsafeEstimateConsecutiveAscii(address, remaining); | 1459 final int skipped = unsafeEstimateConsecutiveAscii(address, remaining); |
1475 address += skipped; | 1460 address += skipped; |
1476 remaining -= skipped; | 1461 remaining -= skipped; |
1477 | 1462 |
1478 for (;;) { | 1463 for (;;) { |
1479 // Optimize for interior runs of ASCII bytes. | 1464 // Optimize for interior runs of ASCII bytes. |
1480 // TODO(nathanmittler): Consider checking 8 bytes at a time after some t
hreshold? | 1465 // TODO(nathanmittler): Consider checking 8 bytes at a time after some t
hreshold? |
1481 // Maybe after seeing a few in a row that are ASCII, go back to fast mod
e? | 1466 // Maybe after seeing a few in a row that are ASCII, go back to fast mod
e? |
1482 int byte1 = 0; | 1467 int byte1 = 0; |
1483 for (; remaining > 0 && (byte1 = UNSAFE.getByte(address++)) >= 0; --rema
ining) { | 1468 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; --
remaining) { |
1484 } | 1469 } |
1485 if (remaining == 0) { | 1470 if (remaining == 0) { |
1486 return COMPLETE; | 1471 return COMPLETE; |
1487 } | 1472 } |
1488 remaining--; | 1473 remaining--; |
1489 | 1474 |
1490 if (byte1 < (byte) 0xE0) { | 1475 if (byte1 < (byte) 0xE0) { |
1491 // Two-byte form | 1476 // Two-byte form |
1492 | 1477 |
1493 if (remaining == 0) { | 1478 if (remaining == 0) { |
1494 // Incomplete sequence | 1479 // Incomplete sequence |
1495 return byte1; | 1480 return byte1; |
1496 } | 1481 } |
1497 remaining--; | 1482 remaining--; |
1498 | 1483 |
1499 // Simultaneously checks for illegal trailing-byte in | 1484 // Simultaneously checks for illegal trailing-byte in |
1500 // leading position and overlong 2-byte form. | 1485 // leading position and overlong 2-byte form. |
1501 if (byte1 < (byte) 0xC2 || UNSAFE.getByte(address++) > (byte) 0xBF) { | 1486 if (byte1 < (byte) 0xC2 || UnsafeUtil.getByte(address++) > (byte) 0xBF
) { |
1502 return MALFORMED; | 1487 return MALFORMED; |
1503 } | 1488 } |
1504 } else if (byte1 < (byte) 0xF0) { | 1489 } else if (byte1 < (byte) 0xF0) { |
1505 // Three-byte form | 1490 // Three-byte form |
1506 | 1491 |
1507 if (remaining < 2) { | 1492 if (remaining < 2) { |
1508 // Incomplete sequence | 1493 // Incomplete sequence |
1509 return unsafeIncompleteStateFor(address, byte1, remaining); | 1494 return unsafeIncompleteStateFor(address, byte1, remaining); |
1510 } | 1495 } |
1511 remaining -= 2; | 1496 remaining -= 2; |
1512 | 1497 |
1513 final byte byte2 = UNSAFE.getByte(address++); | 1498 final byte byte2 = UnsafeUtil.getByte(address++); |
1514 if (byte2 > (byte) 0xBF | 1499 if (byte2 > (byte) 0xBF |
1515 // overlong? 5 most significant bits must not all be zero | 1500 // overlong? 5 most significant bits must not all be zero |
1516 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) | 1501 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) |
1517 // check for illegal surrogate codepoints | 1502 // check for illegal surrogate codepoints |
1518 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) | 1503 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) |
1519 // byte3 trailing-byte test | 1504 // byte3 trailing-byte test |
1520 || UNSAFE.getByte(address++) > (byte) 0xBF) { | 1505 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { |
1521 return MALFORMED; | 1506 return MALFORMED; |
1522 } | 1507 } |
1523 } else { | 1508 } else { |
1524 // Four-byte form | 1509 // Four-byte form |
1525 | 1510 |
1526 if (remaining < 3) { | 1511 if (remaining < 3) { |
1527 // Incomplete sequence | 1512 // Incomplete sequence |
1528 return unsafeIncompleteStateFor(address, byte1, remaining); | 1513 return unsafeIncompleteStateFor(address, byte1, remaining); |
1529 } | 1514 } |
1530 remaining -= 3; | 1515 remaining -= 3; |
1531 | 1516 |
1532 final byte byte2 = UNSAFE.getByte(address++); | 1517 final byte byte2 = UnsafeUtil.getByte(address++); |
1533 if (byte2 > (byte) 0xBF | 1518 if (byte2 > (byte) 0xBF |
1534 // Check that 1 <= plane <= 16. Tricky optimized form of: | 1519 // Check that 1 <= plane <= 16. Tricky optimized form of: |
1535 // if (byte1 > (byte) 0xF4 || | 1520 // if (byte1 > (byte) 0xF4 || |
1536 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || | 1521 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || |
1537 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) | 1522 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) |
1538 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 | 1523 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 |
1539 // byte3 trailing-byte test | 1524 // byte3 trailing-byte test |
1540 || UNSAFE.getByte(address++) > (byte) 0xBF | 1525 || UnsafeUtil.getByte(address++) > (byte) 0xBF |
1541 // byte4 trailing-byte test | 1526 // byte4 trailing-byte test |
1542 || UNSAFE.getByte(address++) > (byte) 0xBF) { | 1527 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { |
1543 return MALFORMED; | 1528 return MALFORMED; |
1544 } | 1529 } |
1545 } | 1530 } |
1546 } | 1531 } |
1547 } | 1532 } |
1548 | 1533 |
1549 private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long of
fset, | 1534 private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long of
fset, |
1550 int remaining) { | 1535 int remaining) { |
1551 switch (remaining) { | 1536 switch (remaining) { |
1552 case 0: { | 1537 case 0: { |
1553 return incompleteStateFor(byte1); | 1538 return incompleteStateFor(byte1); |
1554 } | 1539 } |
1555 case 1: { | 1540 case 1: { |
1556 return incompleteStateFor(byte1, UNSAFE.getByte(bytes, offset)); | 1541 return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset)); |
1557 } | 1542 } |
1558 case 2: { | 1543 case 2: { |
1559 return incompleteStateFor(byte1, UNSAFE.getByte(bytes, offset), | 1544 return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset), |
1560 UNSAFE.getByte(bytes, offset + 1)); | 1545 UnsafeUtil.getByte(bytes, offset + 1)); |
1561 } | 1546 } |
1562 default: { | 1547 default: { |
1563 throw new AssertionError(); | 1548 throw new AssertionError(); |
1564 } | 1549 } |
1565 } | 1550 } |
1566 } | 1551 } |
1567 | 1552 |
1568 private static int unsafeIncompleteStateFor(long address, final int byte1, i
nt remaining) { | 1553 private static int unsafeIncompleteStateFor(long address, final int byte1, i
nt remaining) { |
1569 switch (remaining) { | 1554 switch (remaining) { |
1570 case 0: { | 1555 case 0: { |
1571 return incompleteStateFor(byte1); | 1556 return incompleteStateFor(byte1); |
1572 } | 1557 } |
1573 case 1: { | 1558 case 1: { |
1574 return incompleteStateFor(byte1, UNSAFE.getByte(address)); | 1559 return incompleteStateFor(byte1, UnsafeUtil.getByte(address)); |
1575 } | 1560 } |
1576 case 2: { | 1561 case 2: { |
1577 return incompleteStateFor(byte1, UNSAFE.getByte(address), UNSAFE.getBy
te(address + 1)); | 1562 return incompleteStateFor(byte1, UnsafeUtil.getByte(address), |
| 1563 UnsafeUtil.getByte(address + 1)); |
1578 } | 1564 } |
1579 default: { | 1565 default: { |
1580 throw new AssertionError(); | 1566 throw new AssertionError(); |
1581 } | 1567 } |
1582 } | 1568 } |
1583 } | 1569 } |
1584 | |
1585 /** | |
1586 * Gets the field with the given name within the class, or {@code null} if n
ot found. If | |
1587 * found, the field is made accessible. | |
1588 */ | |
1589 private static Field field(Class<?> clazz, String fieldName) { | |
1590 Field field; | |
1591 try { | |
1592 field = clazz.getDeclaredField(fieldName); | |
1593 field.setAccessible(true); | |
1594 } catch (Throwable t) { | |
1595 // Failed to access the fields. | |
1596 field = null; | |
1597 } | |
1598 logger.log(Level.FINEST, "{0}.{1}: {2}", | |
1599 new Object[] {clazz.getName(), fieldName, (field != null ? "available"
: "unavailable")}); | |
1600 return field; | |
1601 } | |
1602 | |
1603 /** | |
1604 * Returns the offset of the provided field, or {@code -1} if {@code sun.mis
c.Unsafe} is not | |
1605 * available. | |
1606 */ | |
1607 private static long fieldOffset(Field field) { | |
1608 return field == null || UNSAFE == null ? -1 : UNSAFE.objectFieldOffset(fie
ld); | |
1609 } | |
1610 | |
1611 /** | |
1612 * Get the base offset for byte arrays, or {@code -1} if {@code sun.misc.Uns
afe} is not | |
1613 * available. | |
1614 */ | |
1615 private static <T> int byteArrayBaseOffset() { | |
1616 return UNSAFE == null ? -1 : UNSAFE.arrayBaseOffset(byte[].class); | |
1617 } | |
1618 | |
1619 /** | |
1620 * Gets the offset of the {@code address} field of the given direct {@link B
yteBuffer}. | |
1621 */ | |
1622 private static long addressOffset(ByteBuffer buffer) { | |
1623 return UNSAFE.getLong(buffer, BUFFER_ADDRESS_OFFSET); | |
1624 } | |
1625 | |
1626 /** | |
1627 * Gets the {@code sun.misc.Unsafe} instance, or {@code null} if not availab
le on this | |
1628 * platform. | |
1629 */ | |
1630 private static sun.misc.Unsafe getUnsafe() { | |
1631 sun.misc.Unsafe unsafe = null; | |
1632 try { | |
1633 unsafe = AccessController.doPrivileged(new PrivilegedExceptionAction<sun
.misc.Unsafe>() { | |
1634 @Override | |
1635 public sun.misc.Unsafe run() throws Exception { | |
1636 Class<sun.misc.Unsafe> k = sun.misc.Unsafe.class; | |
1637 | |
1638 // Check that this platform supports all of the required unsafe meth
ods. | |
1639 checkRequiredMethods(k); | |
1640 | |
1641 for (Field f : k.getDeclaredFields()) { | |
1642 f.setAccessible(true); | |
1643 Object x = f.get(null); | |
1644 if (k.isInstance(x)) { | |
1645 return k.cast(x); | |
1646 } | |
1647 } | |
1648 // The sun.misc.Unsafe field does not exist. | |
1649 return null; | |
1650 } | |
1651 }); | |
1652 } catch (Throwable e) { | |
1653 // Catching Throwable here due to the fact that Google AppEngine raises
NoClassDefFoundError | |
1654 // for Unsafe. | |
1655 } | |
1656 | |
1657 logger.log(Level.FINEST, "sun.misc.Unsafe: {}", | |
1658 unsafe != null ? "available" : "unavailable"); | |
1659 return unsafe; | |
1660 } | |
1661 | |
1662 /** | |
1663 * Verifies that all required methods of {@code sun.misc.Unsafe} are availab
le on this platform. | |
1664 */ | |
1665 private static void checkRequiredMethods(Class<sun.misc.Unsafe> clazz) | |
1666 throws NoSuchMethodException, SecurityException { | |
1667 // Needed for Unsafe byte[] access | |
1668 clazz.getMethod("arrayBaseOffset", Class.class); | |
1669 clazz.getMethod("getByte", Object.class, long.class); | |
1670 clazz.getMethod("putByte", Object.class, long.class, byte.class); | |
1671 clazz.getMethod("getLong", Object.class, long.class); | |
1672 | |
1673 // Needed for Unsafe Direct ByteBuffer access | |
1674 clazz.getMethod("objectFieldOffset", Field.class); | |
1675 clazz.getMethod("getByte", long.class); | |
1676 clazz.getMethod("getLong", Object.class, long.class); | |
1677 clazz.getMethod("putByte", long.class, byte.class); | |
1678 clazz.getMethod("getLong", long.class); | |
1679 } | |
1680 } | 1570 } |
1681 | 1571 |
1682 private Utf8() {} | 1572 private Utf8() {} |
1683 } | 1573 } |
OLD | NEW |