| OLD | NEW | 
 | (Empty) | 
|    1 // Copyright 2016 The Chromium Authors. All rights reserved. |  | 
|    2 // Use of this source code is governed by a BSD-style license that can be |  | 
|    3 // found in the LICENSE file. |  | 
|    4  |  | 
|    5 #include "platform/inspector_protocol/InspectorProtocol.h" |  | 
|    6  |  | 
|    7 #include <algorithm> |  | 
|    8 #include <cctype> |  | 
|    9 #include <cstdio> |  | 
|   10 #include <locale> |  | 
|   11  |  | 
|   12 namespace blink { |  | 
|   13 namespace protocol { |  | 
|   14  |  | 
// Code unit U+FFFD, written in place of input that cannot be converted when
// a conversion runs in lenient (non-strict) mode.
const UChar replacementCharacter = 0xFFFD;
// Unsigned 32-bit integer used to hold a whole code point while converting.
using UChar32 = uint32_t;
|   17  |  | 
// Returns the total length (2, 3 or 4) of the UTF-8 sequence announced by the
// lead byte |b0|, or 0 when |b0| is not a multi-byte lead byte (ASCII,
// continuation bytes and 0xF8..0xFF all yield 0).
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    const unsigned lead = static_cast<unsigned char>(b0);

    // 110xxxxx announces a two-byte sequence.
    if ((lead >> 5) == 0x06)
        return 2;
    // 1110xxxx announces a three-byte sequence.
    if ((lead >> 4) == 0x0E)
        return 3;
    // 11110xxx announces a four-byte sequence.
    if ((lead >> 3) == 0x1E)
        return 4;
    return 0;
}
|   30  |  | 
|   31 inline int inlineUTF8SequenceLength(char b0) |  | 
|   32 { |  | 
|   33     return String16::isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); |  | 
|   34 } |  | 
|   35  |  | 
// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
// into the first byte, depending on how many bytes follow.  There are
// as many entries in this table as there are UTF-8 sequence types.
// (I.e., one byte sequence, two byte... etc.). Remember that sequences
// for *legal* UTF-8 will be 4 or fewer bytes total.
// The table is indexed by the total sequence length in bytes; index 0 is
// only a placeholder. convertUTF16ToUTF8() in this file only ever looks up
// indices 1 through 4.
static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
|   42  |  | 
// Status reported by the UTF-8 <-> UTF-16 conversion routines.
enum ConversionResult {
    // The whole source range was converted.
    conversionOK,
    // The source ended in the middle of a character.
    sourceExhausted,
    // The target buffer has no room for the next character.
    targetExhausted,
    // The source contains an illegal or malformed sequence.
    sourceIllegal
};
|   49  |  | 
|   50 ConversionResult convertUTF16ToUTF8( |  | 
|   51     const UChar** sourceStart, const UChar* sourceEnd, |  | 
|   52     char** targetStart, char* targetEnd, bool strict) |  | 
|   53 { |  | 
|   54     ConversionResult result = conversionOK; |  | 
|   55     const UChar* source = *sourceStart; |  | 
|   56     char* target = *targetStart; |  | 
|   57     while (source < sourceEnd) { |  | 
|   58         UChar32 ch; |  | 
|   59         unsigned short bytesToWrite = 0; |  | 
|   60         const UChar32 byteMask = 0xBF; |  | 
|   61         const UChar32 byteMark = 0x80; |  | 
|   62         const UChar* oldSource = source; // In case we have to back up because o
     f target overflow. |  | 
|   63         ch = static_cast<unsigned short>(*source++); |  | 
|   64         // If we have a surrogate pair, convert to UChar32 first. |  | 
|   65         if (ch >= 0xD800 && ch <= 0xDBFF) { |  | 
|   66             // If the 16 bits following the high surrogate are in the source buf
     fer... |  | 
|   67             if (source < sourceEnd) { |  | 
|   68                 UChar32 ch2 = static_cast<unsigned short>(*source); |  | 
|   69                 // If it's a low surrogate, convert to UChar32. |  | 
|   70                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { |  | 
|   71                     ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; |  | 
|   72                     ++source; |  | 
|   73                 } else if (strict) { // it's an unpaired high surrogate |  | 
|   74                     --source; // return to the illegal value itself |  | 
|   75                     result = sourceIllegal; |  | 
|   76                     break; |  | 
|   77                 } |  | 
|   78             } else { // We don't have the 16 bits following the high surrogate. |  | 
|   79                 --source; // return to the high surrogate |  | 
|   80                 result = sourceExhausted; |  | 
|   81                 break; |  | 
|   82             } |  | 
|   83         } else if (strict) { |  | 
|   84             // UTF-16 surrogate values are illegal in UTF-32 |  | 
|   85             if (ch >= 0xDC00 && ch <= 0xDFFF) { |  | 
|   86                 --source; // return to the illegal value itself |  | 
|   87                 result = sourceIllegal; |  | 
|   88                 break; |  | 
|   89             } |  | 
|   90         } |  | 
|   91         // Figure out how many bytes the result will require |  | 
|   92         if (ch < (UChar32)0x80) { |  | 
|   93             bytesToWrite = 1; |  | 
|   94         } else if (ch < (UChar32)0x800) { |  | 
|   95             bytesToWrite = 2; |  | 
|   96         } else if (ch < (UChar32)0x10000) { |  | 
|   97             bytesToWrite = 3; |  | 
|   98         } else if (ch < (UChar32)0x110000) { |  | 
|   99             bytesToWrite = 4; |  | 
|  100         } else { |  | 
|  101             bytesToWrite = 3; |  | 
|  102             ch = replacementCharacter; |  | 
|  103         } |  | 
|  104  |  | 
|  105         target += bytesToWrite; |  | 
|  106         if (target > targetEnd) { |  | 
|  107             source = oldSource; // Back up source pointer! |  | 
|  108             target -= bytesToWrite; |  | 
|  109             result = targetExhausted; |  | 
|  110             break; |  | 
|  111         } |  | 
|  112         switch (bytesToWrite) { // note: everything falls through. |  | 
|  113         case 4: |  | 
|  114             *--target = (char)((ch | byteMark) & byteMask); |  | 
|  115             ch >>= 6; |  | 
|  116         case 3: |  | 
|  117             *--target = (char)((ch | byteMark) & byteMask); |  | 
|  118             ch >>= 6; |  | 
|  119         case 2: |  | 
|  120             *--target = (char)((ch | byteMark) & byteMask); |  | 
|  121             ch >>= 6; |  | 
|  122         case 1: |  | 
|  123             *--target =  (char)(ch | firstByteMark[bytesToWrite]); |  | 
|  124         } |  | 
|  125         target += bytesToWrite; |  | 
|  126     } |  | 
|  127     *sourceStart = source; |  | 
|  128     *targetStart = target; |  | 
|  129     return result; |  | 
|  130 } |  | 
|  131  |  | 
// Code point classification and UTF-16 surrogate helpers. The "@stable ICU"
// tags suggest these mirror ICU's macros of the same names (TODO confirm).

/**
 * Is this code point a BMP code point (U+0000..U+ffff)?
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_BMP(c) ((uint32_t)(c) <= 0xffff)

/**
 * Is this code point a supplementary code point (U+10000..U+10ffff)?
 * Values below 0x10000 wrap around in the unsigned subtraction and therefore
 * fail the comparison.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_SUPPLEMENTARY(c) ((uint32_t)((c) - 0x10000) <= 0xfffff)

/**
 * Is this code point a surrogate (U+d800..U+dfff)?
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_SURROGATE(c) (((c) & 0xfffff800) == 0xd800)

/**
 * Get the lead surrogate (0xd800..0xdbff) for a
 * supplementary code point (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) (UChar)(((supplementary) >> 10) + 0xd7c0)

/**
 * Get the trail surrogate (0xdc00..0xdfff) for a
 * supplementary code point (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) (UChar)(((supplementary) & 0x3ff) | 0xdc00)
|  173  |  | 
// Reports whether the |length| bytes starting at |source| form one
// well-formed UTF-8 sequence.
//
// This must be called with the length pre-determined by the first byte; the
// function does not verify that the lead byte announces |length| bytes.
// Any length outside 1..4 returns false: the Unicode definition of UTF-8
// goes up to 4-byte sequences.
static bool isLegalUTF8(const unsigned char* source, int length)
{
    if (length < 1 || length > 4)
        return false;

    const unsigned char lead = source[0];

    // Every trailing byte must be a continuation byte (10xxxxxx). This
    // includes the second byte for all lead values, so that input such as
    // ED 41 80 or F4 41 80 80 is rejected.
    for (int index = 1; index < length; ++index) {
        const unsigned char trail = source[index];
        if (trail < 0x80 || trail > 0xBF)
            return false;
    }

    // Some lead bytes restrict the second byte further.
    if (length >= 2) {
        const unsigned char second = source[1];
        switch (lead) {
        case 0xE0: // Below 0xA0 would be an overlong three-byte form.
            if (second < 0xA0)
                return false;
            break;
        case 0xED: // Above 0x9F would encode a surrogate (U+D800..U+DFFF).
            if (second > 0x9F)
                return false;
            break;
        case 0xF0: // Below 0x90 would be an overlong four-byte form.
            if (second < 0x90)
                return false;
            break;
        case 0xF4: // Above 0x8F would encode a value beyond U+10FFFF.
            if (second > 0x8F)
                return false;
            break;
        default:
            break;
        }
    }

    // 0x80..0xBF are continuation bytes and 0xC0/0xC1 only start overlong
    // two-byte forms; none of them may lead a sequence.
    if (lead >= 0x80 && lead < 0xC2)
        return false;
    // Lead bytes above 0xF4 would encode values beyond U+10FFFF.
    if (lead > 0xF4)
        return false;
    return true;
}
|  226  |  | 
// Magic values subtracted from a buffer value during UTF8 conversion.
// This table contains as many values as there might be trailing bytes
// in a UTF-8 sequence.
// readUTF8Sequence() indexes it with (sequence length - 1). Each entry is the
// sum of the lead-byte and continuation-byte marker bits after shifting, e.g.
// 0x00003080 == (0xC0 << 6) + 0x80 for a two-byte sequence.
static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL) };
|  231  |  | 
|  232 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) |  | 
|  233 { |  | 
|  234     UChar32 character = 0; |  | 
|  235  |  | 
|  236     // The cases all fall through. |  | 
|  237     switch (length) { |  | 
|  238     case 6: |  | 
|  239         character += static_cast<unsigned char>(*sequence++); |  | 
|  240         character <<= 6; |  | 
|  241     case 5: |  | 
|  242         character += static_cast<unsigned char>(*sequence++); |  | 
|  243         character <<= 6; |  | 
|  244     case 4: |  | 
|  245         character += static_cast<unsigned char>(*sequence++); |  | 
|  246         character <<= 6; |  | 
|  247     case 3: |  | 
|  248         character += static_cast<unsigned char>(*sequence++); |  | 
|  249         character <<= 6; |  | 
|  250     case 2: |  | 
|  251         character += static_cast<unsigned char>(*sequence++); |  | 
|  252         character <<= 6; |  | 
|  253     case 1: |  | 
|  254         character += static_cast<unsigned char>(*sequence++); |  | 
|  255     } |  | 
|  256  |  | 
|  257     return character - offsetsFromUTF8[length - 1]; |  | 
|  258 } |  | 
|  259  |  | 
|  260 ConversionResult convertUTF8ToUTF16( |  | 
|  261     const char** sourceStart, const char* sourceEnd, |  | 
|  262     UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict) |  | 
|  263 { |  | 
|  264     ConversionResult result = conversionOK; |  | 
|  265     const char* source = *sourceStart; |  | 
|  266     UChar* target = *targetStart; |  | 
|  267     UChar orAllData = 0; |  | 
|  268     while (source < sourceEnd) { |  | 
|  269         int utf8SequenceLength = inlineUTF8SequenceLength(*source); |  | 
|  270         if (sourceEnd - source < utf8SequenceLength)  { |  | 
|  271             result = sourceExhausted; |  | 
|  272             break; |  | 
|  273         } |  | 
|  274         // Do this check whether lenient or strict |  | 
|  275         if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8Seq
     uenceLength)) { |  | 
|  276             result = sourceIllegal; |  | 
|  277             break; |  | 
|  278         } |  | 
|  279  |  | 
|  280         UChar32 character = readUTF8Sequence(source, utf8SequenceLength); |  | 
|  281  |  | 
|  282         if (target >= targetEnd) { |  | 
|  283             source -= utf8SequenceLength; // Back up source pointer! |  | 
|  284             result = targetExhausted; |  | 
|  285             break; |  | 
|  286         } |  | 
|  287  |  | 
|  288         if (U_IS_BMP(character)) { |  | 
|  289             // UTF-16 surrogate values are illegal in UTF-32 |  | 
|  290             if (U_IS_SURROGATE(character)) { |  | 
|  291                 if (strict) { |  | 
|  292                     source -= utf8SequenceLength; // return to the illegal value
      itself |  | 
|  293                     result = sourceIllegal; |  | 
|  294                     break; |  | 
|  295                 } |  | 
|  296                 *target++ = replacementCharacter; |  | 
|  297                 orAllData |= replacementCharacter; |  | 
|  298             } else { |  | 
|  299                 *target++ = static_cast<UChar>(character); // normal case |  | 
|  300                 orAllData |= character; |  | 
|  301             } |  | 
|  302         } else if (U_IS_SUPPLEMENTARY(character)) { |  | 
|  303             // target is a character in range 0xFFFF - 0x10FFFF |  | 
|  304             if (target + 1 >= targetEnd) { |  | 
|  305                 source -= utf8SequenceLength; // Back up source pointer! |  | 
|  306                 result = targetExhausted; |  | 
|  307                 break; |  | 
|  308             } |  | 
|  309             *target++ = U16_LEAD(character); |  | 
|  310             *target++ = U16_TRAIL(character); |  | 
|  311             orAllData = 0xffff; |  | 
|  312         } else { |  | 
|  313             if (strict) { |  | 
|  314                 source -= utf8SequenceLength; // return to the start |  | 
|  315                 result = sourceIllegal; |  | 
|  316                 break; // Bail out; shouldn't continue |  | 
|  317             } else { |  | 
|  318                 *target++ = replacementCharacter; |  | 
|  319                 orAllData |= replacementCharacter; |  | 
|  320             } |  | 
|  321         } |  | 
|  322     } |  | 
|  323     *sourceStart = source; |  | 
|  324     *targetStart = target; |  | 
|  325  |  | 
|  326     if (sourceAllASCII) |  | 
|  327         *sourceAllASCII = !(orAllData & ~0x7f); |  | 
|  328  |  | 
|  329     return result; |  | 
|  330 } |  | 
|  331  |  | 
|  332 // Helper to write a three-byte UTF-8 code point to the buffer, caller must chec
     k room is available. |  | 
|  333 static inline void putUTF8Triple(char*& buffer, UChar ch) |  | 
|  334 { |  | 
|  335     DCHECK_GE(ch, 0x0800); |  | 
|  336     *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); |  | 
|  337     *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); |  | 
|  338     *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); |  | 
|  339 } |  | 
|  340  |  | 
|  341 String16 String16::fromUTF8(const char* stringStart, size_t length) |  | 
|  342 { |  | 
|  343     if (!stringStart || !length) |  | 
|  344         return String16(); |  | 
|  345  |  | 
|  346     std::vector<UChar> buffer(length); |  | 
|  347     UChar* bufferStart = buffer.data(); |  | 
|  348  |  | 
|  349     UChar* bufferCurrent = bufferStart; |  | 
|  350     const char* stringCurrent = stringStart; |  | 
|  351     if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent,
      bufferCurrent + buffer.size(), 0, true) != conversionOK) |  | 
|  352         return String16(); |  | 
|  353  |  | 
|  354     unsigned utf16Length = bufferCurrent - bufferStart; |  | 
|  355     return String16(bufferStart, utf16Length); |  | 
|  356 } |  | 
|  357  |  | 
|  358 std::string String16::utf8() const |  | 
|  359 { |  | 
|  360     unsigned length = this->length(); |  | 
|  361  |  | 
|  362     if (!length) |  | 
|  363         return std::string(""); |  | 
|  364  |  | 
|  365     // Allocate a buffer big enough to hold all the characters |  | 
|  366     // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). |  | 
|  367     // Optimization ideas, if we find this function is hot: |  | 
|  368     //  * We could speculatively create a CStringBuffer to contain 'length' |  | 
|  369     //    characters, and resize if necessary (i.e. if the buffer contains |  | 
|  370     //    non-ascii characters). (Alternatively, scan the buffer first for |  | 
|  371     //    ascii characters, so we know this will be sufficient). |  | 
|  372     //  * We could allocate a CStringBuffer with an appropriate size to |  | 
|  373     //    have a good chance of being able to write the string into the |  | 
|  374     //    buffer without reallocing (say, 1.5 x length). |  | 
|  375     if (length > std::numeric_limits<unsigned>::max() / 3) |  | 
|  376         return std::string(); |  | 
|  377     std::vector<char> bufferVector(length * 3); |  | 
|  378     char* buffer = bufferVector.data(); |  | 
|  379     const UChar* characters = m_impl.data(); |  | 
|  380  |  | 
|  381     ConversionResult result = convertUTF16ToUTF8(&characters, characters + lengt
     h, &buffer, buffer + bufferVector.size(), false); |  | 
|  382     DCHECK(result != targetExhausted); // (length * 3) should be sufficient for 
     any conversion |  | 
|  383  |  | 
|  384     // Only produced from strict conversion. |  | 
|  385     DCHECK(result != sourceIllegal); |  | 
|  386  |  | 
|  387     // Check for an unconverted high surrogate. |  | 
|  388     if (result == sourceExhausted) { |  | 
|  389         // This should be one unpaired high surrogate. Treat it the same |  | 
|  390         // was as an unpaired high surrogate would have been handled in |  | 
|  391         // the middle of a string with non-strict conversion - which is |  | 
|  392         // to say, simply encode it to UTF-8. |  | 
|  393         DCHECK((characters + 1) == (m_impl.data() + length)); |  | 
|  394         DCHECK((*characters >= 0xD800) && (*characters <= 0xDBFF)); |  | 
|  395         // There should be room left, since one UChar hasn't been |  | 
|  396         // converted. |  | 
|  397         DCHECK((buffer + 3) <= (buffer + bufferVector.size())); |  | 
|  398         putUTF8Triple(buffer, *characters); |  | 
|  399     } |  | 
|  400  |  | 
|  401     return std::string(bufferVector.data(), buffer - bufferVector.data()); |  | 
|  402 } |  | 
|  403  |  | 
|  404 } // namespace protocol |  | 
|  405 } // namespace blink |  | 
| OLD | NEW |