| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (C) 2011 Daniel Bates (dbates@intudata.com). All Rights Reserved. | 2 * Copyright (C) 2011 Daniel Bates (dbates@intudata.com). All Rights Reserved. |
| 3 * Copyright (c) 2012 Google, inc. All Rights Reserved. | 3 * Copyright (c) 2012 Google, inc. All Rights Reserved. |
| 4 * | 4 * |
| 5 * Redistribution and use in source and binary forms, with or without | 5 * Redistribution and use in source and binary forms, with or without |
| 6 * modification, are permitted provided that the following conditions | 6 * modification, are permitted provided that the following conditions |
| 7 * are met: | 7 * are met: |
| 8 * 1. Redistributions of source code must retain the above copyright | 8 * 1. Redistributions of source code must retain the above copyright |
| 9 * notice, this list of conditions and the following disclaimer. | 9 * notice, this list of conditions and the following disclaimer. |
| 10 * 2. Redistributions in binary form must reproduce the above copyright | 10 * 2. Redistributions in binary form must reproduce the above copyright |
| (...skipping 20 matching lines...) Expand all Loading... |
| 31 #define DecodeEscapeSequences_h | 31 #define DecodeEscapeSequences_h |
| 32 | 32 |
| 33 #include "wtf/ASCIICType.h" | 33 #include "wtf/ASCIICType.h" |
| 34 #include "wtf/Allocator.h" | 34 #include "wtf/Allocator.h" |
| 35 #include "wtf/Assertions.h" | 35 #include "wtf/Assertions.h" |
| 36 #include "wtf/text/StringBuilder.h" | 36 #include "wtf/text/StringBuilder.h" |
| 37 #include "wtf/text/TextEncoding.h" | 37 #include "wtf/text/TextEncoding.h" |
| 38 | 38 |
| 39 namespace blink { | 39 namespace blink { |
| 40 | 40 |
// See
// <http://en.wikipedia.org/wiki/Percent-encoding#Non-standard_implementations>.
| 42 struct Unicode16BitEscapeSequence { | 43 struct Unicode16BitEscapeSequence { |
| 43 STATIC_ONLY(Unicode16BitEscapeSequence); | 44 STATIC_ONLY(Unicode16BitEscapeSequence); |
| 44 enum { sequenceSize = 6 }; // e.g. %u26C4 | 45 enum { sequenceSize = 6 }; // e.g. %u26C4 |
| 45 static size_t findInString(const String& string, size_t startPosition) { | 46 static size_t findInString(const String& string, size_t startPosition) { |
| 46 return string.find("%u", startPosition); | 47 return string.find("%u", startPosition); |
| 47 } | 48 } |
| 48 static size_t findEndOfRun(const String& string, | 49 static size_t findEndOfRun(const String& string, |
| 49 size_t startPosition, | 50 size_t startPosition, |
| 50 size_t endPosition) { | 51 size_t endPosition) { |
| 51 size_t runEnd = startPosition; | 52 size_t runEnd = startPosition; |
| 52 while (endPosition - runEnd >= sequenceSize && string[runEnd] == '%' && | 53 while (endPosition - runEnd >= sequenceSize && string[runEnd] == '%' && |
| 53 string[runEnd + 1] == 'u' && isASCIIHexDigit(string[runEnd + 2]) && | 54 string[runEnd + 1] == 'u' && isASCIIHexDigit(string[runEnd + 2]) && |
| 54 isASCIIHexDigit(string[runEnd + 3]) && | 55 isASCIIHexDigit(string[runEnd + 3]) && |
| 55 isASCIIHexDigit(string[runEnd + 4]) && | 56 isASCIIHexDigit(string[runEnd + 4]) && |
| 56 isASCIIHexDigit(string[runEnd + 5])) { | 57 isASCIIHexDigit(string[runEnd + 5])) { |
| 57 runEnd += sequenceSize; | 58 runEnd += sequenceSize; |
| 58 } | 59 } |
| 59 return runEnd; | 60 return runEnd; |
| 60 } | 61 } |
| 61 | 62 |
| 62 template <typename CharType> | 63 template <typename CharType> |
| 63 static String decodeRun(const CharType* run, | 64 static String decodeRun(const CharType* run, |
| 64 size_t runLength, | 65 size_t runLength, |
| 65 const WTF::TextEncoding&) { | 66 const WTF::TextEncoding&) { |
| 66 // Each %u-escape sequence represents a UTF-16 code unit. | 67 // Each %u-escape sequence represents a UTF-16 code unit. See |
| 67 // See <http://www.w3.org/International/iri-edit/draft-duerst-iri.html#ancho
r29>. | 68 // <http://www.w3.org/International/iri-edit/draft-duerst-iri.html#anchor29>
. |
| 68 // For 16-bit escape sequences, we know that findEndOfRun() has given us a c
ontiguous run of sequences | 69 // For 16-bit escape sequences, we know that findEndOfRun() has given us a |
| 69 // without any intervening characters, so decode the run without additional
checks. | 70 // contiguous run of sequences without any intervening characters, so decode |
| 71 // the run without additional checks. |
| 70 size_t numberOfSequences = runLength / sequenceSize; | 72 size_t numberOfSequences = runLength / sequenceSize; |
| 71 StringBuilder builder; | 73 StringBuilder builder; |
| 72 builder.reserveCapacity(numberOfSequences); | 74 builder.reserveCapacity(numberOfSequences); |
| 73 while (numberOfSequences--) { | 75 while (numberOfSequences--) { |
| 74 UChar codeUnit = (toASCIIHexValue(run[2]) << 12) | | 76 UChar codeUnit = (toASCIIHexValue(run[2]) << 12) | |
| 75 (toASCIIHexValue(run[3]) << 8) | | 77 (toASCIIHexValue(run[3]) << 8) | |
| 76 (toASCIIHexValue(run[4]) << 4) | toASCIIHexValue(run[5]); | 78 (toASCIIHexValue(run[4]) << 4) | toASCIIHexValue(run[5]); |
| 77 builder.append(codeUnit); | 79 builder.append(codeUnit); |
| 78 run += sequenceSize; | 80 run += sequenceSize; |
| 79 } | 81 } |
| 80 return builder.toString(); | 82 return builder.toString(); |
| 81 } | 83 } |
| 82 }; | 84 }; |
| 83 | 85 |
| 84 struct URLEscapeSequence { | 86 struct URLEscapeSequence { |
| 85 enum { sequenceSize = 3 }; // e.g. %41 | 87 enum { sequenceSize = 3 }; // e.g. %41 |
| 86 static size_t findInString(const String& string, size_t startPosition) { | 88 static size_t findInString(const String& string, size_t startPosition) { |
| 87 return string.find('%', startPosition); | 89 return string.find('%', startPosition); |
| 88 } | 90 } |
| 89 static size_t findEndOfRun(const String& string, | 91 static size_t findEndOfRun(const String& string, |
| 90 size_t startPosition, | 92 size_t startPosition, |
| 91 size_t endPosition) { | 93 size_t endPosition) { |
| 92 // Make the simplifying assumption that supported encodings may have up to t
wo unescaped characters | 94 // Make the simplifying assumption that supported encodings may have up to |
| 93 // in the range 0x40 - 0x7F as the trailing bytes of their sequences which n
eed to be passed into the | 95 // two unescaped characters in the range 0x40 - 0x7F as the trailing bytes |
| 94 // decoder as part of the run. In other words, we end the run at the first v
alue outside of the | 96 // of their sequences which need to be passed into the decoder as part of |
| 95 // 0x40 - 0x7F range, after two values in this range, or at a %-sign that do
es not introduce a valid | 97 // the run. In other words, we end the run at the first value outside of the |
| 96 // escape sequence. | 98 // 0x40 - 0x7F range, after two values in this range, or at a %-sign that |
| 99 // does not introduce a valid escape sequence. |
| 97 size_t runEnd = startPosition; | 100 size_t runEnd = startPosition; |
| 98 int numberOfTrailingCharacters = 0; | 101 int numberOfTrailingCharacters = 0; |
| 99 while (runEnd < endPosition) { | 102 while (runEnd < endPosition) { |
| 100 if (string[runEnd] == '%') { | 103 if (string[runEnd] == '%') { |
| 101 if (endPosition - runEnd >= sequenceSize && | 104 if (endPosition - runEnd >= sequenceSize && |
| 102 isASCIIHexDigit(string[runEnd + 1]) && | 105 isASCIIHexDigit(string[runEnd + 1]) && |
| 103 isASCIIHexDigit(string[runEnd + 2])) { | 106 isASCIIHexDigit(string[runEnd + 2])) { |
| 104 runEnd += sequenceSize; | 107 runEnd += sequenceSize; |
| 105 numberOfTrailingCharacters = 0; | 108 numberOfTrailingCharacters = 0; |
| 106 } else | 109 } else |
| 107 break; | 110 break; |
| 108 } else if (string[runEnd] >= 0x40 && string[runEnd] <= 0x7F && | 111 } else if (string[runEnd] >= 0x40 && string[runEnd] <= 0x7F && |
| 109 numberOfTrailingCharacters < 2) { | 112 numberOfTrailingCharacters < 2) { |
| 110 runEnd += 1; | 113 runEnd += 1; |
| 111 numberOfTrailingCharacters += 1; | 114 numberOfTrailingCharacters += 1; |
| 112 } else | 115 } else |
| 113 break; | 116 break; |
| 114 } | 117 } |
| 115 return runEnd; | 118 return runEnd; |
| 116 } | 119 } |
| 117 | 120 |
| 118 template <typename CharType> | 121 template <typename CharType> |
| 119 static String decodeRun(const CharType* run, | 122 static String decodeRun(const CharType* run, |
| 120 size_t runLength, | 123 size_t runLength, |
| 121 const WTF::TextEncoding& encoding) { | 124 const WTF::TextEncoding& encoding) { |
| 122 // For URL escape sequences, we know that findEndOfRun() has given us a run
where every %-sign introduces | 125 // For URL escape sequences, we know that findEndOfRun() has given us a run |
| 123 // a valid escape sequence, but there may be characters between the sequence
s. | 126 // where every %-sign introduces a valid escape sequence, but there may be |
| 127 // characters between the sequences. |
| 124 Vector<char, 512> buffer; | 128 Vector<char, 512> buffer; |
| 125 buffer.resize( | 129 buffer.resize( |
| 126 runLength); // Unescaping hex sequences only makes the length smaller. | 130 runLength); // Unescaping hex sequences only makes the length smaller. |
| 127 char* p = buffer.data(); | 131 char* p = buffer.data(); |
| 128 const CharType* runEnd = run + runLength; | 132 const CharType* runEnd = run + runLength; |
| 129 while (run < runEnd) { | 133 while (run < runEnd) { |
| 130 if (run[0] == '%') { | 134 if (run[0] == '%') { |
| 131 *p++ = (toASCIIHexValue(run[1]) << 4) | toASCIIHexValue(run[2]); | 135 *p++ = (toASCIIHexValue(run[1]) << 4) | toASCIIHexValue(run[2]); |
| 132 run += sequenceSize; | 136 run += sequenceSize; |
| 133 } else { | 137 } else { |
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 177 result.append(decoded); | 181 result.append(decoded); |
| 178 decodedPosition = encodedRunEnd; | 182 decodedPosition = encodedRunEnd; |
| 179 } | 183 } |
| 180 result.append(string, decodedPosition, length - decodedPosition); | 184 result.append(string, decodedPosition, length - decodedPosition); |
| 181 return result.toString(); | 185 return result.toString(); |
| 182 } | 186 } |
| 183 | 187 |
| 184 } // namespace blink | 188 } // namespace blink |
| 185 | 189 |
| 186 #endif // DecodeEscapeSequences_h | 190 #endif // DecodeEscapeSequences_h |
| OLD | NEW |