OLD | NEW |
1 /* | 1 /* |
2 * Copyright (C) 2011 Daniel Bates (dbates@intudata.com). All Rights Reserved. | 2 * Copyright (C) 2011 Daniel Bates (dbates@intudata.com). All Rights Reserved. |
3 * Copyright (c) 2012 Google, inc. All Rights Reserved. | 3 * Copyright (c) 2012 Google, inc. All Rights Reserved. |
4 * | 4 * |
5 * Redistribution and use in source and binary forms, with or without | 5 * Redistribution and use in source and binary forms, with or without |
6 * modification, are permitted provided that the following conditions | 6 * modification, are permitted provided that the following conditions |
7 * are met: | 7 * are met: |
8 * 1. Redistributions of source code must retain the above copyright | 8 * 1. Redistributions of source code must retain the above copyright |
9 * notice, this list of conditions and the following disclaimer. | 9 * notice, this list of conditions and the following disclaimer. |
10 * 2. Redistributions in binary form must reproduce the above copyright | 10 * 2. Redistributions in binary form must reproduce the above copyright |
(...skipping 20 matching lines...) Expand all Loading... |
31 #define DecodeEscapeSequences_h | 31 #define DecodeEscapeSequences_h |
32 | 32 |
33 #include "wtf/ASCIICType.h" | 33 #include "wtf/ASCIICType.h" |
34 #include "wtf/Allocator.h" | 34 #include "wtf/Allocator.h" |
35 #include "wtf/Assertions.h" | 35 #include "wtf/Assertions.h" |
36 #include "wtf/text/StringBuilder.h" | 36 #include "wtf/text/StringBuilder.h" |
37 #include "wtf/text/TextEncoding.h" | 37 #include "wtf/text/TextEncoding.h" |
38 | 38 |
39 namespace blink { | 39 namespace blink { |
40 | 40 |
41 // See <http://en.wikipedia.org/wiki/Percent-encoding#Non-standard_implementatio
ns>. | 41 // See |
| 42 // <http://en.wikipedia.org/wiki/Percent-encoding#Non-standard_implementations>. |
42 struct Unicode16BitEscapeSequence { | 43 struct Unicode16BitEscapeSequence { |
43 STATIC_ONLY(Unicode16BitEscapeSequence); | 44 STATIC_ONLY(Unicode16BitEscapeSequence); |
44 enum { sequenceSize = 6 }; // e.g. %u26C4 | 45 enum { sequenceSize = 6 }; // e.g. %u26C4 |
45 static size_t findInString(const String& string, size_t startPosition) { | 46 static size_t findInString(const String& string, size_t startPosition) { |
46 return string.find("%u", startPosition); | 47 return string.find("%u", startPosition); |
47 } | 48 } |
48 static size_t findEndOfRun(const String& string, | 49 static size_t findEndOfRun(const String& string, |
49 size_t startPosition, | 50 size_t startPosition, |
50 size_t endPosition) { | 51 size_t endPosition) { |
51 size_t runEnd = startPosition; | 52 size_t runEnd = startPosition; |
52 while (endPosition - runEnd >= sequenceSize && string[runEnd] == '%' && | 53 while (endPosition - runEnd >= sequenceSize && string[runEnd] == '%' && |
53 string[runEnd + 1] == 'u' && isASCIIHexDigit(string[runEnd + 2]) && | 54 string[runEnd + 1] == 'u' && isASCIIHexDigit(string[runEnd + 2]) && |
54 isASCIIHexDigit(string[runEnd + 3]) && | 55 isASCIIHexDigit(string[runEnd + 3]) && |
55 isASCIIHexDigit(string[runEnd + 4]) && | 56 isASCIIHexDigit(string[runEnd + 4]) && |
56 isASCIIHexDigit(string[runEnd + 5])) { | 57 isASCIIHexDigit(string[runEnd + 5])) { |
57 runEnd += sequenceSize; | 58 runEnd += sequenceSize; |
58 } | 59 } |
59 return runEnd; | 60 return runEnd; |
60 } | 61 } |
61 | 62 |
62 template <typename CharType> | 63 template <typename CharType> |
63 static String decodeRun(const CharType* run, | 64 static String decodeRun(const CharType* run, |
64 size_t runLength, | 65 size_t runLength, |
65 const WTF::TextEncoding&) { | 66 const WTF::TextEncoding&) { |
66 // Each %u-escape sequence represents a UTF-16 code unit. | 67 // Each %u-escape sequence represents a UTF-16 code unit. See |
67 // See <http://www.w3.org/International/iri-edit/draft-duerst-iri.html#ancho
r29>. | 68 // <http://www.w3.org/International/iri-edit/draft-duerst-iri.html#anchor29>
. |
68 // For 16-bit escape sequences, we know that findEndOfRun() has given us a c
ontiguous run of sequences | 69 // For 16-bit escape sequences, we know that findEndOfRun() has given us a |
69 // without any intervening characters, so decode the run without additional
checks. | 70 // contiguous run of sequences without any intervening characters, so decode |
| 71 // the run without additional checks. |
70 size_t numberOfSequences = runLength / sequenceSize; | 72 size_t numberOfSequences = runLength / sequenceSize; |
71 StringBuilder builder; | 73 StringBuilder builder; |
72 builder.reserveCapacity(numberOfSequences); | 74 builder.reserveCapacity(numberOfSequences); |
73 while (numberOfSequences--) { | 75 while (numberOfSequences--) { |
74 UChar codeUnit = (toASCIIHexValue(run[2]) << 12) | | 76 UChar codeUnit = (toASCIIHexValue(run[2]) << 12) | |
75 (toASCIIHexValue(run[3]) << 8) | | 77 (toASCIIHexValue(run[3]) << 8) | |
76 (toASCIIHexValue(run[4]) << 4) | toASCIIHexValue(run[5]); | 78 (toASCIIHexValue(run[4]) << 4) | toASCIIHexValue(run[5]); |
77 builder.append(codeUnit); | 79 builder.append(codeUnit); |
78 run += sequenceSize; | 80 run += sequenceSize; |
79 } | 81 } |
80 return builder.toString(); | 82 return builder.toString(); |
81 } | 83 } |
82 }; | 84 }; |
83 | 85 |
84 struct URLEscapeSequence { | 86 struct URLEscapeSequence { |
85 enum { sequenceSize = 3 }; // e.g. %41 | 87 enum { sequenceSize = 3 }; // e.g. %41 |
86 static size_t findInString(const String& string, size_t startPosition) { | 88 static size_t findInString(const String& string, size_t startPosition) { |
87 return string.find('%', startPosition); | 89 return string.find('%', startPosition); |
88 } | 90 } |
89 static size_t findEndOfRun(const String& string, | 91 static size_t findEndOfRun(const String& string, |
90 size_t startPosition, | 92 size_t startPosition, |
91 size_t endPosition) { | 93 size_t endPosition) { |
92 // Make the simplifying assumption that supported encodings may have up to t
wo unescaped characters | 94 // Make the simplifying assumption that supported encodings may have up to |
93 // in the range 0x40 - 0x7F as the trailing bytes of their sequences which n
eed to be passed into the | 95 // two unescaped characters in the range 0x40 - 0x7F as the trailing bytes |
94 // decoder as part of the run. In other words, we end the run at the first v
alue outside of the | 96 // of their sequences which need to be passed into the decoder as part of |
95 // 0x40 - 0x7F range, after two values in this range, or at a %-sign that do
es not introduce a valid | 97 // the run. In other words, we end the run at the first value outside of the |
96 // escape sequence. | 98 // 0x40 - 0x7F range, after two values in this range, or at a %-sign that |
| 99 // does not introduce a valid escape sequence. |
97 size_t runEnd = startPosition; | 100 size_t runEnd = startPosition; |
98 int numberOfTrailingCharacters = 0; | 101 int numberOfTrailingCharacters = 0; |
99 while (runEnd < endPosition) { | 102 while (runEnd < endPosition) { |
100 if (string[runEnd] == '%') { | 103 if (string[runEnd] == '%') { |
101 if (endPosition - runEnd >= sequenceSize && | 104 if (endPosition - runEnd >= sequenceSize && |
102 isASCIIHexDigit(string[runEnd + 1]) && | 105 isASCIIHexDigit(string[runEnd + 1]) && |
103 isASCIIHexDigit(string[runEnd + 2])) { | 106 isASCIIHexDigit(string[runEnd + 2])) { |
104 runEnd += sequenceSize; | 107 runEnd += sequenceSize; |
105 numberOfTrailingCharacters = 0; | 108 numberOfTrailingCharacters = 0; |
106 } else | 109 } else |
107 break; | 110 break; |
108 } else if (string[runEnd] >= 0x40 && string[runEnd] <= 0x7F && | 111 } else if (string[runEnd] >= 0x40 && string[runEnd] <= 0x7F && |
109 numberOfTrailingCharacters < 2) { | 112 numberOfTrailingCharacters < 2) { |
110 runEnd += 1; | 113 runEnd += 1; |
111 numberOfTrailingCharacters += 1; | 114 numberOfTrailingCharacters += 1; |
112 } else | 115 } else |
113 break; | 116 break; |
114 } | 117 } |
115 return runEnd; | 118 return runEnd; |
116 } | 119 } |
117 | 120 |
118 template <typename CharType> | 121 template <typename CharType> |
119 static String decodeRun(const CharType* run, | 122 static String decodeRun(const CharType* run, |
120 size_t runLength, | 123 size_t runLength, |
121 const WTF::TextEncoding& encoding) { | 124 const WTF::TextEncoding& encoding) { |
122 // For URL escape sequences, we know that findEndOfRun() has given us a run
where every %-sign introduces | 125 // For URL escape sequences, we know that findEndOfRun() has given us a run |
123 // a valid escape sequence, but there may be characters between the sequence
s. | 126 // where every %-sign introduces a valid escape sequence, but there may be |
| 127 // characters between the sequences. |
124 Vector<char, 512> buffer; | 128 Vector<char, 512> buffer; |
125 buffer.resize( | 129 buffer.resize( |
126 runLength); // Unescaping hex sequences only makes the length smaller. | 130 runLength); // Unescaping hex sequences only makes the length smaller. |
127 char* p = buffer.data(); | 131 char* p = buffer.data(); |
128 const CharType* runEnd = run + runLength; | 132 const CharType* runEnd = run + runLength; |
129 while (run < runEnd) { | 133 while (run < runEnd) { |
130 if (run[0] == '%') { | 134 if (run[0] == '%') { |
131 *p++ = (toASCIIHexValue(run[1]) << 4) | toASCIIHexValue(run[2]); | 135 *p++ = (toASCIIHexValue(run[1]) << 4) | toASCIIHexValue(run[2]); |
132 run += sequenceSize; | 136 run += sequenceSize; |
133 } else { | 137 } else { |
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
177 result.append(decoded); | 181 result.append(decoded); |
178 decodedPosition = encodedRunEnd; | 182 decodedPosition = encodedRunEnd; |
179 } | 183 } |
180 result.append(string, decodedPosition, length - decodedPosition); | 184 result.append(string, decodedPosition, length - decodedPosition); |
181 return result.toString(); | 185 return result.toString(); |
182 } | 186 } |
183 | 187 |
184 } // namespace blink | 188 } // namespace blink |
185 | 189 |
186 #endif // DecodeEscapeSequences_h | 190 #endif // DecodeEscapeSequences_h |
OLD | NEW |