| Index: icu46/source/tools/genrb/read.c
|
| ===================================================================
|
| --- icu46/source/tools/genrb/read.c (revision 0)
|
| +++ icu46/source/tools/genrb/read.c (revision 0)
|
| @@ -0,0 +1,473 @@
|
| +/*
|
| +*******************************************************************************
|
| +*
|
| +* Copyright (C) 1998-2009, International Business Machines
|
| +* Corporation and others. All Rights Reserved.
|
| +*
|
| +*******************************************************************************
|
| +*
|
| +* File read.c
|
| +*
|
| +* Modification History:
|
| +*
|
| +* Date Name Description
|
| +* 05/26/99 stephen Creation.
|
| +* 5/10/01 Ram removed ustdio dependency
|
| +*******************************************************************************
|
| +*/
|
| +
|
| +#include "read.h"
|
| +#include "errmsg.h"
|
| +#include "unicode/ustring.h"
|
| +
|
| +#define OPENBRACE 0x007B /* left curly bracket */
|
| +#define CLOSEBRACE 0x007D /* right curly bracket */
|
| +#define COMMA 0x002C /* comma */
|
| +#define QUOTE 0x0022 /* double quotation mark */
|
| +#define ESCAPE 0x005C /* backslash; introduces an escape sequence */
|
| +#define SLASH 0x002F /* forward slash; may start a comment */
|
| +#define ASTERISK 0x002A /* asterisk; second character of a block comment opener */
|
| +#define SPACE 0x0020 /* space; inserted between merged strings */
|
| +#define COLON 0x003A /* colon */
|
| +#define BADBOM 0xFFFE /* getNextToken() returns TOK_ERROR for this value; presumably a byte-swapped U+FEFF - confirm */
|
| +#define CR 0x000D /* carriage return */
|
| +#define LF 0x000A /* line feed */
|
| +
|
| +static int32_t lineCount; /* running line number; set to 1 by resetLineNumber() and incremented as line ends are read */
|
| +
|
| +/* Prototypes of the file-local (static) helpers defined further down */
|
| +static enum ETokenType getStringToken(UCHARBUF *buf,
|
| + UChar32 initialChar,
|
| + struct UString *token,
|
| + UErrorCode *status);
|
| +
|
| +static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
|
| +static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status);
|
| +static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
|
| +static UBool isWhitespace (UChar32 c);
|
| +static UBool isNewline (UChar32 c);
|
| +
|
| +void resetLineNumber() { /* Reset the file-level line counter so that counting starts again at line 1. */
|
| + lineCount = 1;
|
| +}
|
| +
|
| +/* Read and return the next token from the stream. For a string token
|
| + (TOK_STRING) the text is stored in the token parameter; the brace,
|
| + comma and colon tokens are identified by the return value alone.
|
| + TOK_ERROR is returned if status is already a failure, if reading
|
| + the first character fails, or for a BADBOM character; TOK_EOF is
|
| + returned at the end of input. Adjacent strings are merged into one
|
| + token by getStringToken(). The comment parameter is handed to
|
| + getNextChar(), which stores slash-star-star comment text in it. */
|
| +enum ETokenType getNextToken(UCHARBUF* buf,
|
| + struct UString *token,
|
| + uint32_t *linenumber, /* out: linenumber of token */
|
| + struct UString *comment,
|
| + UErrorCode *status) {
|
| + enum ETokenType result;
|
| + UChar32 c;
|
| +
|
| + if (U_FAILURE(*status)) {
|
| + return TOK_ERROR;
|
| + }
|
| +
|
| + /* Skip whitespace and comments; c is the first character of the token */
|
| + c = getNextChar(buf, TRUE, comment, status);
|
| +
|
| + if (U_FAILURE(*status)) {
|
| + return TOK_ERROR;
|
| + }
|
| +
|
| + *linenumber = lineCount; /* line count after the leading whitespace has been skipped */
|
| +
|
| + switch(c) {
|
| + case BADBOM:
|
| + return TOK_ERROR; /* NOTE(review): *status is left unchanged on this path */
|
| + case OPENBRACE:
|
| + return TOK_OPEN_BRACE;
|
| + case CLOSEBRACE:
|
| + return TOK_CLOSE_BRACE;
|
| + case COMMA:
|
| + return TOK_COMMA;
|
| + case U_EOF:
|
| + return TOK_EOF;
|
| + case COLON:
|
| + return TOK_COLON;
|
| +
|
| + default:
|
| + result = getStringToken(buf, c, token, status); /* anything else starts a string token */
|
| + }
|
| +
|
| + *linenumber = lineCount; /* refreshed: reading the string may have advanced lineCount */
|
| + return result;
|
| +}
|
| +
|
| +/* Copy a string token into the given UString. Upon entry, we have
|
| + already read the first character of the string token, which is not
|
| + a whitespace character (but may be a QUOTE or ESCAPE). This
|
| + function reads all subsequent characters that belong to this
|
| + string and copies them into the token parameter. The other
|
| + important, and slightly convoluted, purpose of this function is to
|
| + merge adjacent strings. It looks ahead, and if the next
|
| + non-comment, non-whitespace item is a string, it reads it in as
|
| + well. If two adjacent strings are both quoted, they are merged
|
| + without intervening space; otherwise a single SPACE character is
|
| + inserted. Returns TOK_STRING, TOK_EOF or TOK_ERROR. */
|
| +static enum ETokenType getStringToken(UCHARBUF* buf,
|
| + UChar32 initialChar,
|
| + struct UString *token,
|
| + UErrorCode *status) {
|
| + UBool lastStringWasQuoted;
|
| + UChar32 c;
|
| + UChar target[3] = { '\0' }; /* scratch buffer that U_APPEND_CHAR32 fills for one character */
|
| + UChar *pTarget = target;
|
| + int len=0; /* number of UChars U_APPEND_CHAR32 reported for the current character */
|
| + UBool isFollowingCharEscaped=FALSE; /* TRUE while the next character of a quoted string must be taken literally */
|
| + UBool isNLUnescaped = FALSE; /* TRUE when the current CR or LF came out of unescape() rather than the raw input */
|
| + UChar32 prevC=0; /* previous character processed inside a quoted string */
|
| +
|
| + /* We are guaranteed on entry that initialChar is not a whitespace
|
| + character. If we are at the EOF, or have some other problem, it
|
| + doesn't matter; we still want to validly return the initialChar
|
| + (if nothing else) as a string token. */
|
| +
|
| + if (U_FAILURE(*status)) {
|
| + return TOK_ERROR;
|
| + }
|
| +
|
| + /* setup: start from an empty token with initialChar as the current character */
|
| + lastStringWasQuoted = FALSE;
|
| + c = initialChar;
|
| + ustr_setlen(token, 0, status);
|
| +
|
| + if (U_FAILURE(*status)) {
|
| + return TOK_ERROR;
|
| + }
|
| +
|
| + for (;;) { /* one iteration per adjacent string piece, quoted or unquoted */
|
| + if (c == QUOTE) {
|
| + if (!lastStringWasQuoted && token->fLength > 0) { /* quoted piece after an unquoted one: separate with SPACE */
|
| + ustr_ucat(token, SPACE, status);
|
| +
|
| + if (U_FAILURE(*status)) {
|
| + return TOK_ERROR;
|
| + }
|
| + }
|
| +
|
| + lastStringWasQuoted = TRUE;
|
| +
|
| + for (;;) { /* read the body of the quoted string up to the closing QUOTE */
|
| + c = ucbuf_getc(buf,status);
|
| +
|
| + /* EOF reached before a closing QUOTE was seen */
|
| + if (c == U_EOF) {
|
| + return TOK_EOF;
|
| + }
|
| +
|
| + /* Reading the next character failed */
|
| + if (U_FAILURE(*status)) {
|
| + return TOK_ERROR;
|
| + }
|
| +
|
| + if (c == QUOTE && !isFollowingCharEscaped) { /* closing QUOTE, unless the escape flag is set */
|
| + break;
|
| + }
|
| +
|
| + if (c == ESCAPE && !isFollowingCharEscaped) {
|
| + pTarget = target;
|
| + c = unescape(buf, status);
|
| +
|
| + if (c == U_ERR) {
|
| + return TOK_ERROR;
|
| + }
|
| + if(c == CR || c == LF){ /* line end produced by an escape: must not be counted as a source line below */
|
| + isNLUnescaped = TRUE;
|
| + }
|
| + }
|
| +
|
| + if(c==ESCAPE && !isFollowingCharEscaped){ /* c is still ESCAPE after unescape(): append nothing, take the next character literally */
|
| + isFollowingCharEscaped = TRUE;
|
| + }else{
|
| + U_APPEND_CHAR32(c, pTarget,len);
|
| + pTarget = target;
|
| + ustr_uscat(token, pTarget,len, status);
|
| + isFollowingCharEscaped = FALSE;
|
| + len=0;
|
| + if(c == CR || c == LF){
|
| + if(isNLUnescaped == FALSE && prevC!=CR){ /* raw CR or LF; the check on prevC makes CR LF count once */
|
| + lineCount++;
|
| + }
|
| + isNLUnescaped = FALSE;
|
| + }
|
| + }
|
| +
|
| + if (U_FAILURE(*status)) {
|
| + return TOK_ERROR;
|
| + }
|
| + prevC = c;
|
| + }
|
| + } else {
|
| + if (token->fLength > 0) { /* unquoted piece after earlier text: always separate with SPACE */
|
| + ustr_ucat(token, SPACE, status);
|
| +
|
| + if (U_FAILURE(*status)) {
|
| + return TOK_ERROR;
|
| + }
|
| + }
|
| +
|
| + if(lastStringWasQuoted){
|
| + if(getShowWarning()){
|
| + warning(lineCount, "Mixing quoted and unquoted strings");
|
| + }
|
| + if(isStrict()){
|
| + return TOK_ERROR;
|
| + }
|
| +
|
| + }
|
| +
|
| + lastStringWasQuoted = FALSE;
|
| +
|
| + /* The check on lastStringWasQuoted above fires when
|
| + * an unquoted string follows a quoted one, i.e. the
|
| + * two kinds are being mixed: a warning is issued if
|
| + * getShowWarning() is set, and TOK_ERROR is returned
|
| + * if isStrict() is set
|
| + */
|
| +
|
| + if (c == ESCAPE) {
|
| + pTarget = target;
|
| + c = unescape(buf, status);
|
| +
|
| + /* NOTE(review): the other unescape() call sites test U_ERR; this one tests U_EOF - confirm */
|
| + if (c == U_EOF) {
|
| + return TOK_ERROR;
|
| + }
|
| + }
|
| +
|
| + U_APPEND_CHAR32(c, pTarget,len); /* append the first character of the unquoted piece */
|
| + pTarget = target;
|
| + ustr_uscat(token, pTarget,len, status);
|
| + len=0;
|
| +
|
| + if (U_FAILURE(*status)) {
|
| + return TOK_ERROR;
|
| + }
|
| +
|
| + for (;;) { /* read the rest of the unquoted piece */
|
| + /* DON'T skip whitespace: whitespace ends an unquoted string */
|
| + c = getNextChar(buf, FALSE, NULL, status);
|
| +
|
| + /* EOF reached: the text collected so far is the token */
|
| + if (c == U_EOF) {
|
| + ucbuf_ungetc(c, buf);
|
| + return TOK_STRING;
|
| + }
|
| +
|
| + if (U_FAILURE(*status)) {
|
| + return TOK_STRING; /* NOTE(review): a failed status still yields TOK_STRING here; the caller must check status */
|
| + }
|
| +
|
| + if (c == QUOTE
|
| + || c == OPENBRACE
|
| + || c == CLOSEBRACE
|
| + || c == COMMA
|
| + || c == COLON) {
|
| + ucbuf_ungetc(c, buf); /* delimiter is not part of the string; push it back for the code after this loop */
|
| + break;
|
| + }
|
| +
|
| + if (isWhitespace(c)) { /* isWhitespace() also advances lineCount for LF and U+2029 */
|
| + break;
|
| + }
|
| +
|
| + if (c == ESCAPE) {
|
| + pTarget = target;
|
| + c = unescape(buf, status);
|
| +
|
| + if (c == U_ERR) {
|
| + return TOK_ERROR;
|
| + }
|
| + }
|
| +
|
| + U_APPEND_CHAR32(c, pTarget,len);
|
| + pTarget = target;
|
| + ustr_uscat(token, pTarget,len, status);
|
| + len=0;
|
| + if (U_FAILURE(*status)) {
|
| + return TOK_ERROR;
|
| + }
|
| + }
|
| + }
|
| +
|
| + /* DO skip whitespace (and comments) to find what follows this piece */
|
| + c = getNextChar(buf, TRUE, NULL, status);
|
| +
|
| + if (U_FAILURE(*status)) {
|
| + return TOK_STRING; /* NOTE(review): as above, TOK_STRING is returned although status has failed */
|
| + }
|
| +
|
| + if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) { /* NOTE(review): U_EOF is not tested here, so end of input loops back with c == U_EOF - confirm intended */
|
| + ucbuf_ungetc(c, buf);
|
| + return TOK_STRING;
|
| + }
|
| + }
|
| +}
|
| +
|
| +/* Retrieve the next character that is not part of a comment. If skipwhite is
|
| + true, whitespace is skipped as well. Returns U_EOF at end of input or if status has already failed. */
|
| +static UChar32 getNextChar(UCHARBUF* buf,
|
| + UBool skipwhite,
|
| + struct UString *token, /* receives the text of slash-star-star comments; the helper skips storing when NULL */
|
| + UErrorCode *status) {
|
| + UChar32 c, c2;
|
| +
|
| + if (U_FAILURE(*status)) {
|
| + return U_EOF;
|
| + }
|
| +
|
| + for (;;) { /* repeat until a character outside whitespace and comments is found */
|
| + c = ucbuf_getc(buf,status);
|
| +
|
| + if (c == U_EOF) {
|
| + return U_EOF;
|
| + }
|
| +
|
| + if (skipwhite && isWhitespace(c)) { /* isWhitespace() also advances lineCount for LF and U+2029 */
|
| + continue;
|
| + }
|
| +
|
| + /* This also handles the get() failing case */
|
| + if (c != SLASH) {
|
| + return c;
|
| + }
|
| +
|
| + c = ucbuf_getc(buf,status); /* "/c" */
|
| +
|
| + if (c == U_EOF) {
|
| + return U_EOF; /* a lone SLASH at the very end of input is dropped */
|
| + }
|
| +
|
| + switch (c) {
|
| + case SLASH: /* "//" */
|
| + seekUntilNewline(buf, NULL, status);
|
| + break;
|
| +
|
| + case ASTERISK: /* " / * " */
|
| + c2 = ucbuf_getc(buf, status); /* "/ * c" */
|
| + if(c2 == ASTERISK){ /* "/ * *" */
|
| + /* skip the block comment, storing its text in token */
|
| + seekUntilEndOfComment(buf, token, status);
|
| + } else {
|
| + ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */
|
| + seekUntilEndOfComment(buf, NULL, status);
|
| + }
|
| + break;
|
| +
|
| + default:
|
| + ucbuf_ungetc(c, buf); /* "/c" - put back the c */
|
| + /* If get() failed this is a NOP */
|
| + return SLASH;
|
| + }
|
| +
|
| + }
|
| +}
|
| +
|
| +static void seekUntilNewline(UCHARBUF* buf,
|
| + struct UString *token,
|
| + UErrorCode *status) { /* Consume characters up to and including the next newline, or until U_EOF. */
|
| + UChar32 c;
|
| +
|
| + if (U_FAILURE(*status)) {
|
| + return;
|
| + }
|
| +
|
| + do {
|
| + c = ucbuf_getc(buf,status);
|
| + /* add the char to token; this includes the terminating newline or U_EOF */
|
| + if(token!=NULL){
|
| + ustr_u32cat(token, c, status);
|
| + }
|
| + } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR); /* isNewline() advances lineCount for LF and U+2029; any status other than U_ZERO_ERROR stops the loop */
|
| +}
|
| +
|
| +static void seekUntilEndOfComment(UCHARBUF *buf,
|
| + struct UString *token,
|
| + UErrorCode *status) { /* Consume a block comment through its closing asterisk-slash, storing the text in token if non-NULL. */
|
| + UChar32 c, d;
|
| + uint32_t line;
|
| +
|
| + if (U_FAILURE(*status)) {
|
| + return;
|
| + }
|
| +
|
| + line = lineCount; /* remember where the comment started, for the error message below */
|
| +
|
| + do {
|
| + c = ucbuf_getc(buf, status);
|
| +
|
| + if (c == ASTERISK) {
|
| + d = ucbuf_getc(buf, status);
|
| +
|
| + if (d != SLASH) {
|
| + ucbuf_ungetc(d, buf); /* not the end of the comment: re-read d on the next pass */
|
| + } else {
|
| + break; /* end of comment; the closing pair is not added to token */
|
| + }
|
| + }
|
| + /* add the char to token */
|
| + if(token!=NULL){
|
| + ustr_u32cat(token, c, status);
|
| + }
|
| + /* increment the lineCount (the side effect of isNewline; its result is unused) */
|
| + isNewline(c);
|
| +
|
| + } while (c != U_EOF && *status == U_ZERO_ERROR);
|
| +
|
| + if (c == U_EOF) { /* input ended before the comment was closed */
|
| + *status = U_INVALID_FORMAT_ERROR;
|
| + error(line, "unterminated comment detected");
|
| + }
|
| +}
|
| +
|
| +UChar32 unescape(UCHARBUF *buf,
|
| + UErrorCode *status) { /* Read the escape sequence whose ESCAPE character the caller has already consumed. */
|
| + if (U_FAILURE(*status)) {
|
| + return U_EOF;
|
| + }
|
| +
|
| + /* We expect to be called after the ESCAPE has been seen, so the
|
| + * ESCAPE is pushed back for ucbuf_getcx32 (called below) to read. */
|
| + ucbuf_ungetc(ESCAPE, buf);
|
| +
|
| + return ucbuf_getcx32(buf, status);
|
| +}
|
| +
|
| +static UBool isWhitespace(UChar32 c) { /* TRUE for the characters listed below; NOT a pure predicate, see lineCount */
|
| + switch (c) {
|
| + /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
|
| + case 0x000A:
|
| + case 0x2029:
|
| + lineCount++; /* side effect: LF and U+2029 advance the line counter, then fall through */
|
| + case 0x000D: /* CR is whitespace but does not advance lineCount */
|
| + case 0x0020:
|
| + case 0x0009:
|
| + case 0xFEFF:
|
| + return TRUE;
|
| +
|
| + default:
|
| + return FALSE;
|
| + }
|
| +}
|
| +
|
| +static UBool isNewline(UChar32 c) { /* TRUE for the characters listed below; NOT a pure predicate, see lineCount */
|
| + switch (c) {
|
| + /* '\n', '\r', 0x2029 */
|
| + case 0x000A:
|
| + case 0x2029:
|
| + lineCount++; /* side effect: LF and U+2029 advance the line counter, then fall through */
|
| + case 0x000D: /* CR is a newline but does not advance lineCount */
|
| + return TRUE;
|
| +
|
| + default:
|
| + return FALSE;
|
| + }
|
| +}
|
|
|
| Property changes on: icu46/source/tools/genrb/read.c
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|