OLD | NEW |
1 // Protocol Buffers - Google's data interchange format | 1 // Protocol Buffers - Google's data interchange format |
2 // Copyright 2008 Google Inc. All rights reserved. | 2 // Copyright 2008 Google Inc. All rights reserved. |
3 // https://developers.google.com/protocol-buffers/ | 3 // https://developers.google.com/protocol-buffers/ |
4 // | 4 // |
5 // Redistribution and use in source and binary forms, with or without | 5 // Redistribution and use in source and binary forms, with or without |
6 // modification, are permitted provided that the following conditions are | 6 // modification, are permitted provided that the following conditions are |
7 // met: | 7 // met: |
8 // | 8 // |
9 // * Redistributions of source code must retain the above copyright | 9 // * Redistributions of source code must retain the above copyright |
10 // notice, this list of conditions and the following disclaimer. | 10 // notice, this list of conditions and the following disclaimer. |
(...skipping 12 matching lines...) Expand all Loading... |
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | 23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | 24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | 25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | 26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
30 | 30 |
31 package com.google.protobuf; | 31 package com.google.protobuf; |
32 | 32 |
33 import static com.google.protobuf.UnsafeUtil.addressOffset; | |
34 import static com.google.protobuf.UnsafeUtil.getArrayBaseOffset; | |
35 import static com.google.protobuf.UnsafeUtil.hasUnsafeArrayOperations; | |
36 import static com.google.protobuf.UnsafeUtil.hasUnsafeByteBufferOperations; | |
37 import static java.lang.Character.MAX_SURROGATE; | 33 import static java.lang.Character.MAX_SURROGATE; |
38 import static java.lang.Character.MIN_SURROGATE; | 34 import static java.lang.Character.MIN_SURROGATE; |
39 import static java.lang.Character.isSurrogatePair; | 35 import static java.lang.Character.isSurrogatePair; |
40 import static java.lang.Character.toCodePoint; | 36 import static java.lang.Character.toCodePoint; |
41 | 37 |
| 38 import java.lang.reflect.Field; |
| 39 import java.nio.Buffer; |
42 import java.nio.ByteBuffer; | 40 import java.nio.ByteBuffer; |
| 41 import java.security.AccessController; |
| 42 import java.security.PrivilegedExceptionAction; |
| 43 import java.util.logging.Level; |
| 44 import java.util.logging.Logger; |
43 | 45 |
44 /** | 46 /** |
45 * A set of low-level, high-performance static utility methods related | 47 * A set of low-level, high-performance static utility methods related |
46 * to the UTF-8 character encoding. This class has no dependencies | 48 * to the UTF-8 character encoding. This class has no dependencies |
47 * outside of the core JDK libraries. | 49 * outside of the core JDK libraries. |
48 * | 50 * |
49 * <p>There are several variants of UTF-8. The one implemented by | 51 * <p>There are several variants of UTF-8. The one implemented by |
50 * this class is the restricted definition of UTF-8 introduced in | 52 * this class is the restricted definition of UTF-8 introduced in |
51 * Unicode 3.1, which mandates the rejection of "overlong" byte | 53 * Unicode 3.1, which mandates the rejection of "overlong" byte |
52 * sequences as well as rejection of 3-byte surrogate codepoint byte | 54 * sequences as well as rejection of 3-byte surrogate codepoint byte |
(...skipping 17 matching lines...) Expand all Loading... |
70 * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is | 72 * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is |
71 * well-formed in the absence of additional input, or if the byte sequence | 73 * well-formed in the absence of additional input, or if the byte sequence |
72 * apparently terminated in the middle of a character, an opaque integer | 74 * apparently terminated in the middle of a character, an opaque integer |
73 * "state" value containing enough information to decode the character when | 75 * "state" value containing enough information to decode the character when |
74 * passed to a subsequent invocation of a partial decoding method. | 76 * passed to a subsequent invocation of a partial decoding method. |
75 * | 77 * |
76 * @author martinrb@google.com (Martin Buchholz) | 78 * @author martinrb@google.com (Martin Buchholz) |
77 */ | 79 */ |
78 // TODO(nathanmittler): Copy changes in this class back to Guava | 80 // TODO(nathanmittler): Copy changes in this class back to Guava |
79 final class Utf8 { | 81 final class Utf8 { |
| 82 private static final Logger logger = Logger.getLogger(Utf8.class.getName()); |
80 | 83 |
81 /** | 84 /** |
82 * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations | 85 * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations |
83 * depending on what is available on the platform. The processor is the platform-optimized | 86 * depending on what is available on the platform. The processor is the platform-optimized |
84 * delegate for which all methods are delegated directly to. | 87 * delegate for which all methods are delegated directly to. |
85 */ | 88 */ |
86 private static final Processor processor = | 89 private static final Processor processor = |
87 UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor(
); | 90 UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor(
); |
88 | 91 |
89 /** | 92 /** |
(...skipping 137 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
227 default: | 230 default: |
228 throw new AssertionError(); | 231 throw new AssertionError(); |
229 } | 232 } |
230 } | 233 } |
231 | 234 |
232 // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw | 235 // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw |
233 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can | 236 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can |
234 // fallback to more lenient behavior. | 237 // fallback to more lenient behavior. |
235 | 238 |
236 static class UnpairedSurrogateException extends IllegalArgumentException { | 239 static class UnpairedSurrogateException extends IllegalArgumentException { |
237 UnpairedSurrogateException(int index, int length) { | 240 private UnpairedSurrogateException(int index, int length) { |
238 super("Unpaired surrogate at index " + index + " of " + length); | 241 super("Unpaired surrogate at index " + index + " of " + length); |
239 } | 242 } |
240 } | 243 } |
241 | 244 |
242 /** | 245 /** |
243 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, | 246 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, |
244 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in | 247 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in |
245 * both time and space. | 248 * both time and space. |
246 * | 249 * |
247 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired | 250 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired |
(...skipping 733 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
981 } | 984 } |
982 } | 985 } |
983 } | 986 } |
984 } | 987 } |
985 } | 988 } |
986 | 989 |
987 /** | 990 /** |
988 * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance. | 991 * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance. |
989 */ | 992 */ |
990 static final class UnsafeProcessor extends Processor { | 993 static final class UnsafeProcessor extends Processor { |
| 994 private static final sun.misc.Unsafe UNSAFE = getUnsafe(); |
| 995 private static final long BUFFER_ADDRESS_OFFSET = |
| 996 fieldOffset(field(Buffer.class, "address")); |
| 997 private static final int ARRAY_BASE_OFFSET = byteArrayBaseOffset(); |
| 998 |
| 999 /** |
| 1000 * We only use Unsafe operations if we have access to direct {@link ByteBuffer}'s address |
| 1001 * and the array base offset is a multiple of 8 (needed by Unsafe.getLong()). |
| 1002 */ |
| 1003 private static final boolean AVAILABLE = |
| 1004 BUFFER_ADDRESS_OFFSET != -1 && ARRAY_BASE_OFFSET % 8 == 0; |
| 1005 |
991 /** | 1006 /** |
992 * Indicates whether or not all required unsafe operations are supported on this platform. | 1007 * Indicates whether or not all required unsafe operations are supported on this platform. |
993 */ | 1008 */ |
994 static boolean isAvailable() { | 1009 static boolean isAvailable() { |
995 return hasUnsafeArrayOperations() && hasUnsafeByteBufferOperations(); | 1010 return AVAILABLE; |
996 } | 1011 } |
997 | 1012 |
998 @Override | 1013 @Override |
999 int partialIsValidUtf8(int state, byte[] bytes, final int index, final int l
imit) { | 1014 int partialIsValidUtf8(int state, byte[] bytes, final int index, final int l
imit) { |
1000 if ((index | limit | bytes.length - limit) < 0) { | 1015 if ((index | limit | bytes.length - limit) < 0) { |
1001 throw new ArrayIndexOutOfBoundsException( | 1016 throw new ArrayIndexOutOfBoundsException( |
1002 String.format("Array length=%d, index=%d, limit=%d", bytes.length, i
ndex, limit)); | 1017 String.format("Array length=%d, index=%d, limit=%d", bytes.length, i
ndex, limit)); |
1003 } | 1018 } |
1004 long offset = getArrayBaseOffset() + index; | 1019 long offset = ARRAY_BASE_OFFSET + index; |
1005 final long offsetLimit = getArrayBaseOffset() + limit; | 1020 final long offsetLimit = ARRAY_BASE_OFFSET + limit; |
1006 if (state != COMPLETE) { | 1021 if (state != COMPLETE) { |
1007 // The previous decoding operation was incomplete (or malformed). | 1022 // The previous decoding operation was incomplete (or malformed). |
1008 // We look for a well-formed sequence consisting of bytes from | 1023 // We look for a well-formed sequence consisting of bytes from |
1009 // the previous decoding operation (stored in state) together | 1024 // the previous decoding operation (stored in state) together |
1010 // with bytes from the array slice. | 1025 // with bytes from the array slice. |
1011 // | 1026 // |
1012 // We expect such "straddler characters" to be rare. | 1027 // We expect such "straddler characters" to be rare. |
1013 | 1028 |
1014 if (offset >= offsetLimit) { // No bytes? No progress. | 1029 if (offset >= offsetLimit) { // No bytes? No progress. |
1015 return state; | 1030 return state; |
1016 } | 1031 } |
1017 int byte1 = (byte) state; | 1032 int byte1 = (byte) state; |
1018 // byte1 is never ASCII. | 1033 // byte1 is never ASCII. |
1019 if (byte1 < (byte) 0xE0) { | 1034 if (byte1 < (byte) 0xE0) { |
1020 // two-byte form | 1035 // two-byte form |
1021 | 1036 |
1022 // Simultaneously checks for illegal trailing-byte in | 1037 // Simultaneously checks for illegal trailing-byte in |
1023 // leading position and overlong 2-byte form. | 1038 // leading position and overlong 2-byte form. |
1024 if (byte1 < (byte) 0xC2 | 1039 if (byte1 < (byte) 0xC2 |
1025 // byte2 trailing-byte test | 1040 // byte2 trailing-byte test |
1026 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { | 1041 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { |
1027 return MALFORMED; | 1042 return MALFORMED; |
1028 } | 1043 } |
1029 } else if (byte1 < (byte) 0xF0) { | 1044 } else if (byte1 < (byte) 0xF0) { |
1030 // three-byte form | 1045 // three-byte form |
1031 | 1046 |
1032 // Get byte2 from saved state or array | 1047 // Get byte2 from saved state or array |
1033 int byte2 = (byte) ~(state >> 8); | 1048 int byte2 = (byte) ~(state >> 8); |
1034 if (byte2 == 0) { | 1049 if (byte2 == 0) { |
1035 byte2 = UnsafeUtil.getByte(bytes, offset++); | 1050 byte2 = UNSAFE.getByte(bytes, offset++); |
1036 if (offset >= offsetLimit) { | 1051 if (offset >= offsetLimit) { |
1037 return incompleteStateFor(byte1, byte2); | 1052 return incompleteStateFor(byte1, byte2); |
1038 } | 1053 } |
1039 } | 1054 } |
1040 if (byte2 > (byte) 0xBF | 1055 if (byte2 > (byte) 0xBF |
1041 // overlong? 5 most significant bits must not all be zero | 1056 // overlong? 5 most significant bits must not all be zero |
1042 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) | 1057 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) |
1043 // illegal surrogate codepoint? | 1058 // illegal surrogate codepoint? |
1044 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) | 1059 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) |
1045 // byte3 trailing-byte test | 1060 // byte3 trailing-byte test |
1046 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { | 1061 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { |
1047 return MALFORMED; | 1062 return MALFORMED; |
1048 } | 1063 } |
1049 } else { | 1064 } else { |
1050 // four-byte form | 1065 // four-byte form |
1051 | 1066 |
1052 // Get byte2 and byte3 from saved state or array | 1067 // Get byte2 and byte3 from saved state or array |
1053 int byte2 = (byte) ~(state >> 8); | 1068 int byte2 = (byte) ~(state >> 8); |
1054 int byte3 = 0; | 1069 int byte3 = 0; |
1055 if (byte2 == 0) { | 1070 if (byte2 == 0) { |
1056 byte2 = UnsafeUtil.getByte(bytes, offset++); | 1071 byte2 = UNSAFE.getByte(bytes, offset++); |
1057 if (offset >= offsetLimit) { | 1072 if (offset >= offsetLimit) { |
1058 return incompleteStateFor(byte1, byte2); | 1073 return incompleteStateFor(byte1, byte2); |
1059 } | 1074 } |
1060 } else { | 1075 } else { |
1061 byte3 = (byte) (state >> 16); | 1076 byte3 = (byte) (state >> 16); |
1062 } | 1077 } |
1063 if (byte3 == 0) { | 1078 if (byte3 == 0) { |
1064 byte3 = UnsafeUtil.getByte(bytes, offset++); | 1079 byte3 = UNSAFE.getByte(bytes, offset++); |
1065 if (offset >= offsetLimit) { | 1080 if (offset >= offsetLimit) { |
1066 return incompleteStateFor(byte1, byte2, byte3); | 1081 return incompleteStateFor(byte1, byte2, byte3); |
1067 } | 1082 } |
1068 } | 1083 } |
1069 | 1084 |
1070 // If we were called with state == MALFORMED, then byte1 is 0xFF, | 1085 // If we were called with state == MALFORMED, then byte1 is 0xFF, |
1071 // which never occurs in well-formed UTF-8, and so we will return | 1086 // which never occurs in well-formed UTF-8, and so we will return |
1072 // MALFORMED again below. | 1087 // MALFORMED again below. |
1073 | 1088 |
1074 if (byte2 > (byte) 0xBF | 1089 if (byte2 > (byte) 0xBF |
1075 // Check that 1 <= plane <= 16. Tricky optimized form of: | 1090 // Check that 1 <= plane <= 16. Tricky optimized form of: |
1076 // if (byte1 > (byte) 0xF4 || | 1091 // if (byte1 > (byte) 0xF4 || |
1077 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || | 1092 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || |
1078 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) | 1093 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) |
1079 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 | 1094 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 |
1080 // byte3 trailing-byte test | 1095 // byte3 trailing-byte test |
1081 || byte3 > (byte) 0xBF | 1096 || byte3 > (byte) 0xBF |
1082 // byte4 trailing-byte test | 1097 // byte4 trailing-byte test |
1083 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { | 1098 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { |
1084 return MALFORMED; | 1099 return MALFORMED; |
1085 } | 1100 } |
1086 } | 1101 } |
1087 } | 1102 } |
1088 | 1103 |
1089 return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset)); | 1104 return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset)); |
1090 } | 1105 } |
1091 | 1106 |
1092 @Override | 1107 @Override |
1093 int partialIsValidUtf8Direct( | 1108 int partialIsValidUtf8Direct( |
(...skipping 18 matching lines...) Expand all Loading... |
1112 | 1127 |
1113 final int byte1 = (byte) state; | 1128 final int byte1 = (byte) state; |
1114 // byte1 is never ASCII. | 1129 // byte1 is never ASCII. |
1115 if (byte1 < (byte) 0xE0) { | 1130 if (byte1 < (byte) 0xE0) { |
1116 // two-byte form | 1131 // two-byte form |
1117 | 1132 |
1118 // Simultaneously checks for illegal trailing-byte in | 1133 // Simultaneously checks for illegal trailing-byte in |
1119 // leading position and overlong 2-byte form. | 1134 // leading position and overlong 2-byte form. |
1120 if (byte1 < (byte) 0xC2 | 1135 if (byte1 < (byte) 0xC2 |
1121 // byte2 trailing-byte test | 1136 // byte2 trailing-byte test |
1122 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { | 1137 || UNSAFE.getByte(address++) > (byte) 0xBF) { |
1123 return MALFORMED; | 1138 return MALFORMED; |
1124 } | 1139 } |
1125 } else if (byte1 < (byte) 0xF0) { | 1140 } else if (byte1 < (byte) 0xF0) { |
1126 // three-byte form | 1141 // three-byte form |
1127 | 1142 |
1128 // Get byte2 from saved state or array | 1143 // Get byte2 from saved state or array |
1129 int byte2 = (byte) ~(state >> 8); | 1144 int byte2 = (byte) ~(state >> 8); |
1130 if (byte2 == 0) { | 1145 if (byte2 == 0) { |
1131 byte2 = UnsafeUtil.getByte(address++); | 1146 byte2 = UNSAFE.getByte(address++); |
1132 if (address >= addressLimit) { | 1147 if (address >= addressLimit) { |
1133 return incompleteStateFor(byte1, byte2); | 1148 return incompleteStateFor(byte1, byte2); |
1134 } | 1149 } |
1135 } | 1150 } |
1136 if (byte2 > (byte) 0xBF | 1151 if (byte2 > (byte) 0xBF |
1137 // overlong? 5 most significant bits must not all be zero | 1152 // overlong? 5 most significant bits must not all be zero |
1138 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) | 1153 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) |
1139 // illegal surrogate codepoint? | 1154 // illegal surrogate codepoint? |
1140 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) | 1155 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) |
1141 // byte3 trailing-byte test | 1156 // byte3 trailing-byte test |
1142 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { | 1157 || UNSAFE.getByte(address++) > (byte) 0xBF) { |
1143 return MALFORMED; | 1158 return MALFORMED; |
1144 } | 1159 } |
1145 } else { | 1160 } else { |
1146 // four-byte form | 1161 // four-byte form |
1147 | 1162 |
1148 // Get byte2 and byte3 from saved state or array | 1163 // Get byte2 and byte3 from saved state or array |
1149 int byte2 = (byte) ~(state >> 8); | 1164 int byte2 = (byte) ~(state >> 8); |
1150 int byte3 = 0; | 1165 int byte3 = 0; |
1151 if (byte2 == 0) { | 1166 if (byte2 == 0) { |
1152 byte2 = UnsafeUtil.getByte(address++); | 1167 byte2 = UNSAFE.getByte(address++); |
1153 if (address >= addressLimit) { | 1168 if (address >= addressLimit) { |
1154 return incompleteStateFor(byte1, byte2); | 1169 return incompleteStateFor(byte1, byte2); |
1155 } | 1170 } |
1156 } else { | 1171 } else { |
1157 byte3 = (byte) (state >> 16); | 1172 byte3 = (byte) (state >> 16); |
1158 } | 1173 } |
1159 if (byte3 == 0) { | 1174 if (byte3 == 0) { |
1160 byte3 = UnsafeUtil.getByte(address++); | 1175 byte3 = UNSAFE.getByte(address++); |
1161 if (address >= addressLimit) { | 1176 if (address >= addressLimit) { |
1162 return incompleteStateFor(byte1, byte2, byte3); | 1177 return incompleteStateFor(byte1, byte2, byte3); |
1163 } | 1178 } |
1164 } | 1179 } |
1165 | 1180 |
1166 // If we were called with state == MALFORMED, then byte1 is 0xFF, | 1181 // If we were called with state == MALFORMED, then byte1 is 0xFF, |
1167 // which never occurs in well-formed UTF-8, and so we will return | 1182 // which never occurs in well-formed UTF-8, and so we will return |
1168 // MALFORMED again below. | 1183 // MALFORMED again below. |
1169 | 1184 |
1170 if (byte2 > (byte) 0xBF | 1185 if (byte2 > (byte) 0xBF |
1171 // Check that 1 <= plane <= 16. Tricky optimized form of: | 1186 // Check that 1 <= plane <= 16. Tricky optimized form of: |
1172 // if (byte1 > (byte) 0xF4 || | 1187 // if (byte1 > (byte) 0xF4 || |
1173 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || | 1188 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || |
1174 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) | 1189 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) |
1175 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 | 1190 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 |
1176 // byte3 trailing-byte test | 1191 // byte3 trailing-byte test |
1177 || byte3 > (byte) 0xBF | 1192 || byte3 > (byte) 0xBF |
1178 // byte4 trailing-byte test | 1193 // byte4 trailing-byte test |
1179 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { | 1194 || UNSAFE.getByte(address++) > (byte) 0xBF) { |
1180 return MALFORMED; | 1195 return MALFORMED; |
1181 } | 1196 } |
1182 } | 1197 } |
1183 } | 1198 } |
1184 | 1199 |
1185 return partialIsValidUtf8(address, (int) (addressLimit - address)); | 1200 return partialIsValidUtf8(address, (int) (addressLimit - address)); |
1186 } | 1201 } |
1187 | 1202 |
1188 @Override | 1203 @Override |
1189 int encodeUtf8(final CharSequence in, final byte[] out, final int offset, fi
nal int length) { | 1204 int encodeUtf8(final CharSequence in, final byte[] out, final int offset, fi
nal int length) { |
1190 long outIx = getArrayBaseOffset() + offset; | 1205 long outIx = ARRAY_BASE_OFFSET + offset; |
1191 final long outLimit = outIx + length; | 1206 final long outLimit = outIx + length; |
1192 final int inLimit = in.length(); | 1207 final int inLimit = in.length(); |
1193 if (inLimit > length || out.length - length < offset) { | 1208 if (inLimit > length || out.length - length < offset) { |
1194 // Not even enough room for an ASCII-encoded string. | 1209 // Not even enough room for an ASCII-encoded string. |
1195 throw new ArrayIndexOutOfBoundsException( | 1210 throw new ArrayIndexOutOfBoundsException( |
1196 "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset
+ length)); | 1211 "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset
+ length)); |
1197 } | 1212 } |
1198 | 1213 |
1199 // Designed to take advantage of | 1214 // Designed to take advantage of |
1200 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination | 1215 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination |
1201 int inIx = 0; | 1216 int inIx = 0; |
1202 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { | 1217 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { |
1203 UnsafeUtil.putByte(out, outIx++, (byte) c); | 1218 UNSAFE.putByte(out, outIx++, (byte) c); |
1204 } | 1219 } |
1205 if (inIx == inLimit) { | 1220 if (inIx == inLimit) { |
1206 // We're done, it was ASCII encoded. | 1221 // We're done, it was ASCII encoded. |
1207 return (int) (outIx - getArrayBaseOffset()); | 1222 return (int) (outIx - ARRAY_BASE_OFFSET); |
1208 } | 1223 } |
1209 | 1224 |
1210 for (char c; inIx < inLimit; ++inIx) { | 1225 for (char c; inIx < inLimit; ++inIx) { |
1211 c = in.charAt(inIx); | 1226 c = in.charAt(inIx); |
1212 if (c < 0x80 && outIx < outLimit) { | 1227 if (c < 0x80 && outIx < outLimit) { |
1213 UnsafeUtil.putByte(out, outIx++, (byte) c); | 1228 UNSAFE.putByte(out, outIx++, (byte) c); |
1214 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8
bytes | 1229 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8
bytes |
1215 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 6) | (c >>> 6))); | 1230 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 6) | (c >>> 6))); |
1216 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); | 1231 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); |
1217 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit
- 3L) { | 1232 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit
- 3L) { |
1218 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 byte
s | 1233 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 byte
s |
1219 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 5) | (c >>> 12))); | 1234 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 5) | (c >>> 12))); |
1220 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); | 1235 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); |
1221 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); | 1236 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); |
1222 } else if (outIx <= outLimit - 4L) { | 1237 } else if (outIx <= outLimit - 4L) { |
1223 // Minimum code point represented by a surrogate pair is 0x10000, 17 b
its, four UTF-8 | 1238 // Minimum code point represented by a surrogate pair is 0x10000, 17 b
its, four UTF-8 |
1224 // bytes | 1239 // bytes |
1225 final char low; | 1240 final char low; |
1226 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx
)))) { | 1241 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx
)))) { |
1227 throw new UnpairedSurrogateException((inIx - 1), inLimit); | 1242 throw new UnpairedSurrogateException((inIx - 1), inLimit); |
1228 } | 1243 } |
1229 int codePoint = toCodePoint(c, low); | 1244 int codePoint = toCodePoint(c, low); |
1230 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 4) | (codePoint >>> 1
8))); | 1245 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)))
; |
1231 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>>
12)))); | 1246 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)
))); |
1232 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>>
6)))); | 1247 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))
)); |
1233 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & codePoint))); | 1248 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & codePoint))); |
1234 } else { | 1249 } else { |
1235 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) | 1250 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) |
1236 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)
))) { | 1251 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)
))) { |
1237 // We are surrogates and we're not a surrogate pair. | 1252 // We are surrogates and we're not a surrogate pair. |
1238 throw new UnpairedSurrogateException(inIx, inLimit); | 1253 throw new UnpairedSurrogateException(inIx, inLimit); |
1239 } | 1254 } |
1240 // Not enough space in the output buffer. | 1255 // Not enough space in the output buffer. |
1241 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at
index " + outIx); | 1256 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at
index " + outIx); |
1242 } | 1257 } |
1243 } | 1258 } |
1244 | 1259 |
1245 // All bytes have been encoded. | 1260 // All bytes have been encoded. |
1246 return (int) (outIx - getArrayBaseOffset()); | 1261 return (int) (outIx - ARRAY_BASE_OFFSET); |
1247 } | 1262 } |
1248 | 1263 |
1249 @Override | 1264 @Override |
1250 void encodeUtf8Direct(CharSequence in, ByteBuffer out) { | 1265 void encodeUtf8Direct(CharSequence in, ByteBuffer out) { |
1251 final long address = addressOffset(out); | 1266 final long address = addressOffset(out); |
1252 long outIx = address + out.position(); | 1267 long outIx = address + out.position(); |
1253 final long outLimit = address + out.limit(); | 1268 final long outLimit = address + out.limit(); |
1254 final int inLimit = in.length(); | 1269 final int inLimit = in.length(); |
1255 if (inLimit > outLimit - outIx) { | 1270 if (inLimit > outLimit - outIx) { |
1256 // Not even enough room for an ASCII-encoded string. | 1271 // Not even enough room for an ASCII-encoded string. |
1257 throw new ArrayIndexOutOfBoundsException( | 1272 throw new ArrayIndexOutOfBoundsException( |
1258 "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limi
t()); | 1273 "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limi
t()); |
1259 } | 1274 } |
1260 | 1275 |
1261 // Designed to take advantage of | 1276 // Designed to take advantage of |
1262 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination | 1277 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination |
1263 int inIx = 0; | 1278 int inIx = 0; |
1264 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { | 1279 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { |
1265 UnsafeUtil.putByte(outIx++, (byte) c); | 1280 UNSAFE.putByte(outIx++, (byte) c); |
1266 } | 1281 } |
1267 if (inIx == inLimit) { | 1282 if (inIx == inLimit) { |
1268 // We're done, it was ASCII encoded. | 1283 // We're done, it was ASCII encoded. |
1269 out.position((int) (outIx - address)); | 1284 out.position((int) (outIx - address)); |
1270 return; | 1285 return; |
1271 } | 1286 } |
1272 | 1287 |
1273 for (char c; inIx < inLimit; ++inIx) { | 1288 for (char c; inIx < inLimit; ++inIx) { |
1274 c = in.charAt(inIx); | 1289 c = in.charAt(inIx); |
1275 if (c < 0x80 && outIx < outLimit) { | 1290 if (c < 0x80 && outIx < outLimit) { |
1276 UnsafeUtil.putByte(outIx++, (byte) c); | 1291 UNSAFE.putByte(outIx++, (byte) c); |
1277 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8
bytes | 1292 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8
bytes |
1278 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6))); | 1293 UNSAFE.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6))); |
1279 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c))); | 1294 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & c))); |
1280 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit
- 3L) { | 1295 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit
- 3L) { |
1281 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 byte
s | 1296 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 byte
s |
1282 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12))); | 1297 UNSAFE.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12))); |
1283 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); | 1298 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); |
1284 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c))); | 1299 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & c))); |
1285 } else if (outIx <= outLimit - 4L) { | 1300 } else if (outIx <= outLimit - 4L) { |
1286 // Minimum code point represented by a surrogate pair is 0x10000, 17 b
its, four UTF-8 | 1301 // Minimum code point represented by a surrogate pair is 0x10000, 17 b
its, four UTF-8 |
1287 // bytes | 1302 // bytes |
1288 final char low; | 1303 final char low; |
1289 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx
)))) { | 1304 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx
)))) { |
1290 throw new UnpairedSurrogateException((inIx - 1), inLimit); | 1305 throw new UnpairedSurrogateException((inIx - 1), inLimit); |
1291 } | 1306 } |
1292 int codePoint = toCodePoint(c, low); | 1307 int codePoint = toCodePoint(c, low); |
1293 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); | 1308 UNSAFE.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); |
1294 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))
)); | 1309 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); |
1295 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))
); | 1310 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); |
1296 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint))); | 1311 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint))); |
1297 } else { | 1312 } else { |
1298 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) | 1313 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) |
1299 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)
))) { | 1314 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)
))) { |
1300 // We are surrogates and we're not a surrogate pair. | 1315 // We are surrogates and we're not a surrogate pair. |
1301 throw new UnpairedSurrogateException(inIx, inLimit); | 1316 throw new UnpairedSurrogateException(inIx, inLimit); |
1302 } | 1317 } |
1303 // Not enough space in the output buffer. | 1318 // Not enough space in the output buffer. |
1304 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at
index " + outIx); | 1319 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at
index " + outIx); |
1305 } | 1320 } |
1306 } | 1321 } |
(...skipping 20 matching lines...) Expand all Loading... |
1327 return 0; | 1342 return 0; |
1328 } | 1343 } |
1329 | 1344 |
1330 // Read bytes until 8-byte aligned so that we can read longs in the loop below. | 1345 // Read bytes until 8-byte aligned so that we can read longs in the loop below. |
1331 // Byte arrays are already either 8 or 16-byte aligned, so we just need to make sure that | 1346 // Byte arrays are already either 8 or 16-byte aligned, so we just need to make sure that |
1332 // the index (relative to the start of the array) is also 8-byte aligned. We do this by | 1347 // the index (relative to the start of the array) is also 8-byte aligned. We do this by |
1333 // ANDing the index with 7 to determine the number of bytes that need to be read before | 1348 // ANDing the index with 7 to determine the number of bytes that need to be read before |
1334 // we're 8-byte aligned. | 1349 // we're 8-byte aligned. |
1335 final int unaligned = (int) offset & 7; | 1350 final int unaligned = (int) offset & 7; |
1336 for (int j = unaligned; j > 0; j--) { | 1351 for (int j = unaligned; j > 0; j--) { |
1337 if (UnsafeUtil.getByte(bytes, offset++) < 0) { | 1352 if (UNSAFE.getByte(bytes, offset++) < 0) { |
1338 return unaligned - j; | 1353 return unaligned - j; |
1339 } | 1354 } |
1340 } | 1355 } |
1341 | 1356 |
1342 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). | 1357 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). |
1343 // To speed things up further, we're reading longs instead of bytes so we use a mask to | 1358 // To speed things up further, we're reading longs instead of bytes so we use a mask to |
1344 // determine if any byte in the current long is non-ASCII. | 1359 // determine if any byte in the current long is non-ASCII. |
1345 remaining -= unaligned; | 1360 remaining -= unaligned; |
1346 for (; remaining >= 8 && (UnsafeUtil.getLong(bytes, offset) & ASCII_MASK_L
ONG) == 0; | 1361 for (; remaining >= 8 && (UNSAFE.getLong(bytes, offset) & ASCII_MASK_LONG)
== 0; |
1347 offset += 8, remaining -= 8) {} | 1362 offset += 8, remaining -= 8) {} |
1348 return maxChars - remaining; | 1363 return maxChars - remaining; |
1349 } | 1364 } |
1350 | 1365 |
1351 /** | 1366 /** |
1352 * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} excep
t that it uses the | 1367 * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} excep
t that it uses the |
1353 * most efficient method available to the platform. | 1368 * most efficient method available to the platform. |
1354 */ | 1369 */ |
1355 private static int unsafeEstimateConsecutiveAscii(long address, final int ma
xChars) { | 1370 private static int unsafeEstimateConsecutiveAscii(long address, final int ma
xChars) { |
1356 int remaining = maxChars; | 1371 int remaining = maxChars; |
1357 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) { | 1372 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) { |
1358 // Don't bother with small strings. | 1373 // Don't bother with small strings. |
1359 return 0; | 1374 return 0; |
1360 } | 1375 } |
1361 | 1376 |
1362 // Read bytes until 8-byte aligned so that we can read longs in the loop b
elow. | 1377 // Read bytes until 8-byte aligned so that we can read longs in the loop b
elow. |
1363 // We do this by ANDing the address with 7 to determine the number of byte
s that need to | 1378 // We do this by ANDing the address with 7 to determine the number of byte
s that need to |
1364 // be read before we're 8-byte aligned. | 1379 // be read before we're 8-byte aligned. |
1365 final int unaligned = (int) address & 7; | 1380 final int unaligned = (int) address & 7; |
1366 for (int j = unaligned; j > 0; j--) { | 1381 for (int j = unaligned; j > 0; j--) { |
1367 if (UnsafeUtil.getByte(address++) < 0) { | 1382 if (UNSAFE.getByte(address++) < 0) { |
1368 return unaligned - j; | 1383 return unaligned - j; |
1369 } | 1384 } |
1370 } | 1385 } |
1371 | 1386 |
1372 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII
). | 1387 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII
). |
1373 // To speed things up further, we're reading longs instead of bytes so we
use a mask to | 1388 // To speed things up further, we're reading longs instead of bytes so we
use a mask to |
1374 // determine if any byte in the current long is non-ASCII. | 1389 // determine if any byte in the current long is non-ASCII. |
1375 remaining -= unaligned; | 1390 remaining -= unaligned; |
1376 for (; remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) =
= 0; | 1391 for (; remaining >= 8 && (UNSAFE.getLong(address) & ASCII_MASK_LONG) == 0; |
1377 address += 8, remaining -= 8) {} | 1392 address += 8, remaining -= 8) {} |
1378 return maxChars - remaining; | 1393 return maxChars - remaining; |
1379 } | 1394 } |
1380 | 1395 |
/**
 * Validates {@code remaining} bytes of {@code bytes} starting at the Unsafe offset
 * {@code offset} as UTF-8.
 *
 * <p>Bytes are read as signed values, so every comparison against a {@code (byte)} constant
 * is a signed comparison: all non-ASCII bytes are negative, and continuation bytes
 * (0x80..0xBF) are the values that are {@code <= (byte) 0xBF}.
 *
 * @param bytes the array to read from
 * @param offset offset of the first byte, in the form expected by {@code UNSAFE.getByte}
 * @param remaining number of bytes to validate
 * @return {@code COMPLETE} if all bytes were consumed as whole, well-formed sequences;
 *     {@code MALFORMED} if an ill-formed sequence was found; otherwise a value describing the
 *     truncated trailing sequence (the lead byte itself for a two-byte form, or the result of
 *     {@code unsafeIncompleteStateFor} for three- and four-byte forms)
 */
private static int partialIsValidUtf8(final byte[] bytes, long offset, int remaining) {
  // Skip past ASCII characters as quickly as possible.
  final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remaining);
  remaining -= skipped;
  offset += skipped;

  for (;;) {
    // Optimize for interior runs of ASCII bytes.
    // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
    // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
    int byte1 = 0;
    // Note: when this loop stops on a non-ASCII byte, that byte has been read (offset was
    // advanced) but not yet counted; the remaining-- below accounts for it.
    for (; remaining > 0 && (byte1 = UNSAFE.getByte(bytes, offset++)) >= 0; --remaining) {
    }
    if (remaining == 0) {
      return COMPLETE;
    }
    remaining--;

    // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms.
    if (byte1 < (byte) 0xE0) {
      // Two-byte form (110xxxxx 10xxxxxx)
      if (remaining == 0) {
        // Incomplete sequence
        return byte1;
      }
      remaining--;

      // Simultaneously checks for illegal trailing-byte in
      // leading position and overlong 2-byte form.
      if (byte1 < (byte) 0xC2
          || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {
        return MALFORMED;
      }
    } else if (byte1 < (byte) 0xF0) {
      // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
      if (remaining < 2) {
        // Incomplete sequence
        return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);
      }
      remaining -= 2;

      // The || chain below relies on short-circuit evaluation: byte3 is only read once
      // byte2 has passed every check.
      final int byte2;
      if ((byte2 = UNSAFE.getByte(bytes, offset++)) > (byte) 0xBF
          // overlong? 5 most significant bits must not all be zero
          || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
          // check for illegal surrogate codepoints
          || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
          // byte3 trailing-byte test
          || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {
        return MALFORMED;
      }
    } else {
      // Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
      if (remaining < 3) {
        // Incomplete sequence
        return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);
      }
      remaining -= 3;

      final int byte2;
      if ((byte2 = UNSAFE.getByte(bytes, offset++)) > (byte) 0xBF
          // Check that 1 <= plane <= 16. Tricky optimized form of:
          //   if (byte1 > (byte) 0xF4 ||
          //       byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
          //       byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
          || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
          // byte3 trailing-byte test
          || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF
          // byte4 trailing-byte test
          || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {
        return MALFORMED;
      }
    }
  }
}
1456 | 1471 |
/**
 * Validates {@code remaining} bytes of raw memory starting at {@code address} as UTF-8.
 *
 * <p>Bytes are read as signed values, so every comparison against a {@code (byte)} constant
 * is a signed comparison: all non-ASCII bytes are negative, and continuation bytes
 * (0x80..0xBF) are the values that are {@code <= (byte) 0xBF}.
 *
 * @param address memory address of the first byte, as accepted by {@code UNSAFE.getByte}
 * @param remaining number of bytes to validate
 * @return {@code COMPLETE} if all bytes were consumed as whole, well-formed sequences;
 *     {@code MALFORMED} if an ill-formed sequence was found; otherwise a value describing the
 *     truncated trailing sequence (the lead byte itself for a two-byte form, or the result of
 *     {@code unsafeIncompleteStateFor} for three- and four-byte forms)
 */
private static int partialIsValidUtf8(long address, int remaining) {
  // Skip past ASCII characters as quickly as possible.
  final int skipped = unsafeEstimateConsecutiveAscii(address, remaining);
  address += skipped;
  remaining -= skipped;

  for (;;) {
    // Optimize for interior runs of ASCII bytes.
    // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
    // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
    int byte1 = 0;
    // Note: when this loop stops on a non-ASCII byte, that byte has been read (address was
    // advanced) but not yet counted; the remaining-- below accounts for it.
    for (; remaining > 0 && (byte1 = UNSAFE.getByte(address++)) >= 0; --remaining) {
    }
    if (remaining == 0) {
      return COMPLETE;
    }
    remaining--;

    // byte1 is non-ASCII from here on; only the 2-, 3- and 4-byte forms are possible.
    if (byte1 < (byte) 0xE0) {
      // Two-byte form (110xxxxx 10xxxxxx)

      if (remaining == 0) {
        // Incomplete sequence
        return byte1;
      }
      remaining--;

      // Simultaneously checks for illegal trailing-byte in
      // leading position and overlong 2-byte form.
      if (byte1 < (byte) 0xC2 || UNSAFE.getByte(address++) > (byte) 0xBF) {
        return MALFORMED;
      }
    } else if (byte1 < (byte) 0xF0) {
      // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)

      if (remaining < 2) {
        // Incomplete sequence
        return unsafeIncompleteStateFor(address, byte1, remaining);
      }
      remaining -= 2;

      // The || chain below relies on short-circuit evaluation: byte3 is only read once
      // byte2 has passed every check.
      final byte byte2 = UNSAFE.getByte(address++);
      if (byte2 > (byte) 0xBF
          // overlong? 5 most significant bits must not all be zero
          || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
          // check for illegal surrogate codepoints
          || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
          // byte3 trailing-byte test
          || UNSAFE.getByte(address++) > (byte) 0xBF) {
        return MALFORMED;
      }
    } else {
      // Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)

      if (remaining < 3) {
        // Incomplete sequence
        return unsafeIncompleteStateFor(address, byte1, remaining);
      }
      remaining -= 3;

      final byte byte2 = UNSAFE.getByte(address++);
      if (byte2 > (byte) 0xBF
          // Check that 1 <= plane <= 16. Tricky optimized form of:
          //   if (byte1 > (byte) 0xF4 ||
          //       byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
          //       byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
          || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
          // byte3 trailing-byte test
          || UNSAFE.getByte(address++) > (byte) 0xBF
          // byte4 trailing-byte test
          || UNSAFE.getByte(address++) > (byte) 0xBF) {
        return MALFORMED;
      }
    }
  }
}
1533 | 1548 |
1534 private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long of
fset, | 1549 private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long of
fset, |
1535 int remaining) { | 1550 int remaining) { |
1536 switch (remaining) { | 1551 switch (remaining) { |
1537 case 0: { | 1552 case 0: { |
1538 return incompleteStateFor(byte1); | 1553 return incompleteStateFor(byte1); |
1539 } | 1554 } |
1540 case 1: { | 1555 case 1: { |
1541 return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset)); | 1556 return incompleteStateFor(byte1, UNSAFE.getByte(bytes, offset)); |
1542 } | 1557 } |
1543 case 2: { | 1558 case 2: { |
1544 return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset), | 1559 return incompleteStateFor(byte1, UNSAFE.getByte(bytes, offset), |
1545 UnsafeUtil.getByte(bytes, offset + 1)); | 1560 UNSAFE.getByte(bytes, offset + 1)); |
1546 } | 1561 } |
1547 default: { | 1562 default: { |
1548 throw new AssertionError(); | 1563 throw new AssertionError(); |
1549 } | 1564 } |
1550 } | 1565 } |
1551 } | 1566 } |
1552 | 1567 |
1553 private static int unsafeIncompleteStateFor(long address, final int byte1, i
nt remaining) { | 1568 private static int unsafeIncompleteStateFor(long address, final int byte1, i
nt remaining) { |
1554 switch (remaining) { | 1569 switch (remaining) { |
1555 case 0: { | 1570 case 0: { |
1556 return incompleteStateFor(byte1); | 1571 return incompleteStateFor(byte1); |
1557 } | 1572 } |
1558 case 1: { | 1573 case 1: { |
1559 return incompleteStateFor(byte1, UnsafeUtil.getByte(address)); | 1574 return incompleteStateFor(byte1, UNSAFE.getByte(address)); |
1560 } | 1575 } |
1561 case 2: { | 1576 case 2: { |
1562 return incompleteStateFor(byte1, UnsafeUtil.getByte(address), | 1577 return incompleteStateFor(byte1, UNSAFE.getByte(address), UNSAFE.getBy
te(address + 1)); |
1563 UnsafeUtil.getByte(address + 1)); | |
1564 } | 1578 } |
1565 default: { | 1579 default: { |
1566 throw new AssertionError(); | 1580 throw new AssertionError(); |
1567 } | 1581 } |
1568 } | 1582 } |
1569 } | 1583 } |
| 1584 |
| 1585 /** |
| 1586 * Gets the field with the given name within the class, or {@code null} if n
ot found. If |
| 1587 * found, the field is made accessible. |
| 1588 */ |
| 1589 private static Field field(Class<?> clazz, String fieldName) { |
| 1590 Field field; |
| 1591 try { |
| 1592 field = clazz.getDeclaredField(fieldName); |
| 1593 field.setAccessible(true); |
| 1594 } catch (Throwable t) { |
| 1595 // Failed to access the fields. |
| 1596 field = null; |
| 1597 } |
| 1598 logger.log(Level.FINEST, "{0}.{1}: {2}", |
| 1599 new Object[] {clazz.getName(), fieldName, (field != null ? "available"
: "unavailable")}); |
| 1600 return field; |
| 1601 } |
| 1602 |
| 1603 /** |
| 1604 * Returns the offset of the provided field, or {@code -1} if {@code sun.mis
c.Unsafe} is not |
| 1605 * available. |
| 1606 */ |
| 1607 private static long fieldOffset(Field field) { |
| 1608 return field == null || UNSAFE == null ? -1 : UNSAFE.objectFieldOffset(fie
ld); |
| 1609 } |
| 1610 |
| 1611 /** |
| 1612 * Get the base offset for byte arrays, or {@code -1} if {@code sun.misc.Uns
afe} is not |
| 1613 * available. |
| 1614 */ |
| 1615 private static <T> int byteArrayBaseOffset() { |
| 1616 return UNSAFE == null ? -1 : UNSAFE.arrayBaseOffset(byte[].class); |
| 1617 } |
| 1618 |
| 1619 /** |
| 1620 * Gets the offset of the {@code address} field of the given direct {@link B
yteBuffer}. |
| 1621 */ |
| 1622 private static long addressOffset(ByteBuffer buffer) { |
| 1623 return UNSAFE.getLong(buffer, BUFFER_ADDRESS_OFFSET); |
| 1624 } |
| 1625 |
| 1626 /** |
| 1627 * Gets the {@code sun.misc.Unsafe} instance, or {@code null} if not availab
le on this |
| 1628 * platform. |
| 1629 */ |
| 1630 private static sun.misc.Unsafe getUnsafe() { |
| 1631 sun.misc.Unsafe unsafe = null; |
| 1632 try { |
| 1633 unsafe = AccessController.doPrivileged(new PrivilegedExceptionAction<sun
.misc.Unsafe>() { |
| 1634 @Override |
| 1635 public sun.misc.Unsafe run() throws Exception { |
| 1636 Class<sun.misc.Unsafe> k = sun.misc.Unsafe.class; |
| 1637 |
| 1638 // Check that this platform supports all of the required unsafe meth
ods. |
| 1639 checkRequiredMethods(k); |
| 1640 |
| 1641 for (Field f : k.getDeclaredFields()) { |
| 1642 f.setAccessible(true); |
| 1643 Object x = f.get(null); |
| 1644 if (k.isInstance(x)) { |
| 1645 return k.cast(x); |
| 1646 } |
| 1647 } |
| 1648 // The sun.misc.Unsafe field does not exist. |
| 1649 return null; |
| 1650 } |
| 1651 }); |
| 1652 } catch (Throwable e) { |
| 1653 // Catching Throwable here due to the fact that Google AppEngine raises
NoClassDefFoundError |
| 1654 // for Unsafe. |
| 1655 } |
| 1656 |
| 1657 logger.log(Level.FINEST, "sun.misc.Unsafe: {}", |
| 1658 unsafe != null ? "available" : "unavailable"); |
| 1659 return unsafe; |
| 1660 } |
| 1661 |
| 1662 /** |
| 1663 * Verifies that all required methods of {@code sun.misc.Unsafe} are availab
le on this platform. |
| 1664 */ |
| 1665 private static void checkRequiredMethods(Class<sun.misc.Unsafe> clazz) |
| 1666 throws NoSuchMethodException, SecurityException { |
| 1667 // Needed for Unsafe byte[] access |
| 1668 clazz.getMethod("arrayBaseOffset", Class.class); |
| 1669 clazz.getMethod("getByte", Object.class, long.class); |
| 1670 clazz.getMethod("putByte", Object.class, long.class, byte.class); |
| 1671 clazz.getMethod("getLong", Object.class, long.class); |
| 1672 |
| 1673 // Needed for Unsafe Direct ByteBuffer access |
| 1674 clazz.getMethod("objectFieldOffset", Field.class); |
| 1675 clazz.getMethod("getByte", long.class); |
| 1676 clazz.getMethod("getLong", Object.class, long.class); |
| 1677 clazz.getMethod("putByte", long.class, byte.class); |
| 1678 clazz.getMethod("getLong", long.class); |
| 1679 } |
1570 } | 1680 } |
1571 | 1681 |
// Private no-arg constructor: instances cannot be created from outside this class.
private Utf8() {}
1573 } | 1683 } |
OLD | NEW |