Index: gcc/libcpp/charset.c |
diff --git a/gcc/libcpp/charset.c b/gcc/libcpp/charset.c |
index e743b1e277fcd412972bde2d70871697075ca028..304efc8de0d55672c80fec7119136a4b351e3531 100644 |
--- a/gcc/libcpp/charset.c |
+++ b/gcc/libcpp/charset.c |
@@ -169,7 +169,7 @@ static inline int |
one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp, |
cppchar_t *cp) |
{ |
- static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 }; |
+ static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 }; |
static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; |
cppchar_t c; |
@@ -721,6 +721,8 @@ cpp_init_iconv (cpp_reader *pfile) |
pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET); |
pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision); |
+ pfile->utf8_cset_desc = init_iconv_desc (pfile, "UTF-8", SOURCE_CHARSET); |
+ pfile->utf8_cset_desc.width = CPP_OPTION (pfile, char_precision); |
pfile->char16_cset_desc = init_iconv_desc (pfile, |
be ? "UTF-16BE" : "UTF-16LE", |
SOURCE_CHARSET); |
@@ -741,6 +743,12 @@ _cpp_destroy_iconv (cpp_reader *pfile) |
{ |
if (pfile->narrow_cset_desc.func == convert_using_iconv) |
iconv_close (pfile->narrow_cset_desc.cd); |
+ if (pfile->utf8_cset_desc.func == convert_using_iconv) |
+ iconv_close (pfile->utf8_cset_desc.cd); |
+ if (pfile->char16_cset_desc.func == convert_using_iconv) |
+ iconv_close (pfile->char16_cset_desc.cd); |
+ if (pfile->char32_cset_desc.func == convert_using_iconv) |
+ iconv_close (pfile->char32_cset_desc.cd); |
if (pfile->wide_cset_desc.func == convert_using_iconv) |
iconv_close (pfile->wide_cset_desc.cd); |
} |
@@ -948,10 +956,16 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c, |
ISO/IEC 10646 is NNNNNNNN; the character designated by the |
universal character name \uNNNN is that character whose character |
short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value |
- for a universal character name is less than 0x20 or in the range |
- 0x7F-0x9F (inclusive), or if the universal character name |
- designates a character in the basic source character set, then the |
- program is ill-formed. |
+ for a universal character name corresponds to a surrogate code point |
+ (in the range 0xD800-0xDFFF, inclusive), the program is ill-formed. |
+ Additionally, if the hexadecimal value for a universal-character-name |
+ outside a character or string literal corresponds to a control character |
+ (in either of the ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a |
+ character in the basic source character set, the program is ill-formed. |
+ |
+ C99 6.4.3: A universal character name shall not specify a character |
+ whose short identifier is less than 00A0 other than 0024 ($), 0040 (@), |
+ or 0060 (`), nor one in the range D800 through DFFF inclusive. |
*PSTR must be preceded by "\u" or "\U"; it is assumed that the |
buffer end is delimited by a non-hex digit. Returns zero if the |
@@ -1018,9 +1032,12 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, |
(int) (str - base), base); |
result = 1; |
} |
- /* The standard permits $, @ and ` to be specified as UCNs. We use |
- hex escapes so that this also works with EBCDIC hosts. */ |
+ /* The C99 standard permits $, @ and ` to be specified as UCNs. We use |
+ hex escapes so that this also works with EBCDIC hosts. |
+ C++0x permits everything below 0xa0 within literals; |
+ ucn_valid_in_identifier will complain about identifiers. */ |
else if ((result < 0xa0 |
+ && !CPP_OPTION (pfile, cplusplus) |
&& (result != 0x24 && result != 0x40 && result != 0x60)) |
|| (result & 0x80000000) |
|| (result >= 0xD800 && result <= 0xDFFF)) |
@@ -1301,7 +1318,7 @@ convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit, |
unknown: |
if (ISGRAPH (c)) |
cpp_error (pfile, CPP_DL_PEDWARN, |
- "unknown escape sequence '\\%c'", (int) c); |
+ "unknown escape sequence: '\\%c'", (int) c); |
else |
{ |
/* diagnostic.c does not support "%03o". When it does, this |
@@ -1330,6 +1347,8 @@ converter_for_type (cpp_reader *pfile, enum cpp_ttype type) |
{ |
default: |
return pfile->narrow_cset_desc; |
+ case CPP_UTF8STRING: |
+ return pfile->utf8_cset_desc; |
case CPP_CHAR16: |
case CPP_STRING16: |
return pfile->char16_cset_desc; |
@@ -1364,7 +1383,34 @@ cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count, |
for (i = 0; i < count; i++) |
{ |
p = from[i].text; |
- if (*p == 'L' || *p == 'u' || *p == 'U') p++; |
+ if (*p == 'u') |
+ { |
+ if (*++p == '8') |
+ p++; |
+ } |
+ else if (*p == 'L' || *p == 'U') p++; |
+ if (*p == 'R') |
+ { |
+ const uchar *prefix; |
+ |
+ /* Skip over 'R"'. */ |
+ p += 2; |
+ prefix = p; |
+ while (*p != '(') |
+ p++; |
+ p++; |
+ limit = from[i].text + from[i].len; |
+ if (limit >= p + (p - prefix) + 1) |
+ limit -= (p - prefix) + 1; |
+ |
+ /* Raw strings are all normal characters; these can be fed |
+ directly to convert_cset. */ |
+ if (!APPLY_CONVERSION (cvt, p, limit - p, &tbuf)) |
+ goto fail; |
+ |
+ continue; |
+ } |
+ |
p++; /* Skip leading quote. */ |
limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */ |