Index: third_party/protobuf/java/src/main/java/com/google/protobuf/Internal.java |
=================================================================== |
--- third_party/protobuf/java/src/main/java/com/google/protobuf/Internal.java (revision 216642) |
+++ third_party/protobuf/java/src/main/java/com/google/protobuf/Internal.java (working copy) |
@@ -103,85 +103,32 @@ |
* Helper called by generated code to determine if a byte array is a valid |
* UTF-8 encoded string such that the original bytes can be converted to |
* a String object and then back to a byte array round tripping the bytes |
- * without loss. |
- * <p> |
- * This is inspired by UTF_8.java in sun.nio.cs. |
+ * without loss. More precisely, returns {@code true} whenever: |
+ * <pre> {@code |
+ * Arrays.equals(byteString.toByteArray(), |
+ * new String(byteString.toByteArray(), "UTF-8").getBytes("UTF-8")) |
+ * }</pre> |
* |
+ * <p>This method rejects "overlong" byte sequences, as well as |
+ * 3-byte sequences that would map to a surrogate character, in |
+ * accordance with the restricted definition of UTF-8 introduced in |
+ * Unicode 3.1. Note that the UTF-8 decoder included in Oracle's |
+ * JDK has been modified to also reject "overlong" byte sequences, |
+ * but currently (2011) still accepts 3-byte surrogate character |
+ * byte sequences. |
+ * |
+ * <p>See the Unicode Standard,<br> |
+ * Table 3-6. <em>UTF-8 Bit Distribution</em>,<br> |
+ * Table 3-7. <em>Well-Formed UTF-8 Byte Sequences</em>. |
+ * |
+ * <p>As of 2011-02, this method simply returns the result of {@link |
+ * ByteString#isValidUtf8()}. Calling that method directly is preferred. |
+ * |
* @param byteString the string to check |
* @return whether the byte array is round trippable |
*/ |
public static boolean isValidUtf8(ByteString byteString) { |
- int index = 0; |
- int size = byteString.size(); |
- // To avoid the masking, we could change this to use bytes; |
- // Then X > 0xC2 gets turned into X < -0xC2; X < 0x80 |
- // gets turned into X >= 0, etc. |
- |
- while (index < size) { |
- int byte1 = byteString.byteAt(index++) & 0xFF; |
- if (byte1 < 0x80) { |
- // fast loop for single bytes |
- continue; |
- |
- // we know from this point on that we have 2-4 byte forms |
- } else if (byte1 < 0xC2 || byte1 > 0xF4) { |
- // catch illegal first bytes: < C2 or > F4 |
- return false; |
- } |
- if (index >= size) { |
- // fail if we run out of bytes |
- return false; |
- } |
- int byte2 = byteString.byteAt(index++) & 0xFF; |
- if (byte2 < 0x80 || byte2 > 0xBF) { |
- // general trail-byte test |
- return false; |
- } |
- if (byte1 <= 0xDF) { |
- // two-byte form; general trail-byte test is sufficient |
- continue; |
- } |
- |
- // we know from this point on that we have 3 or 4 byte forms |
- if (index >= size) { |
- // fail if we run out of bytes |
- return false; |
- } |
- int byte3 = byteString.byteAt(index++) & 0xFF; |
- if (byte3 < 0x80 || byte3 > 0xBF) { |
- // general trail-byte test |
- return false; |
- } |
- if (byte1 <= 0xEF) { |
- // three-byte form. Vastly more frequent than four-byte forms |
- // The following has an extra test, but not worth restructuring |
- if (byte1 == 0xE0 && byte2 < 0xA0 || |
- byte1 == 0xED && byte2 > 0x9F) { |
- // check special cases of byte2 |
- return false; |
- } |
- |
- } else { |
- // four-byte form |
- |
- if (index >= size) { |
- // fail if we run out of bytes |
- return false; |
- } |
- int byte4 = byteString.byteAt(index++) & 0xFF; |
- if (byte4 < 0x80 || byte4 > 0xBF) { |
- // general trail-byte test |
- return false; |
- } |
- // The following has an extra test, but not worth restructuring |
- if (byte1 == 0xF0 && byte2 < 0x90 || |
- byte1 == 0xF4 && byte2 > 0x8F) { |
- // check special cases of byte2 |
- return false; |
- } |
- } |
- } |
- return true; |
+ return byteString.isValidUtf8(); |
} |
/** |