OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * |
| 4 * Copyright (C) 1998-2009, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ******************************************************************************* |
| 8 * |
| 9 * File read.c |
| 10 * |
| 11 * Modification History: |
| 12 * |
| 13 * Date Name Description |
| 14 * 05/26/99 stephen Creation. |
| 15 * 5/10/01 Ram removed ustdio dependency |
| 16 ******************************************************************************* |
| 17 */ |
| 18 |
| 19 #include "read.h" |
| 20 #include "errmsg.h" |
| 21 #include "unicode/ustring.h" |
| 22 |
/* Code points that the tokenizer in this file treats specially. */

/* Structural punctuation: each is returned as its own token. */
#define OPENBRACE  0x007B   /* '{' */
#define CLOSEBRACE 0x007D   /* '}' */
#define COMMA      0x002C   /* ',' */
#define COLON      0x003A   /* ':' */

/* String syntax. */
#define QUOTE      0x0022   /* '"' */
#define ESCAPE     0x005C   /* '\' */
#define SPACE      0x0020   /* ' ' */

/* Comment syntax. */
#define SLASH      0x002F   /* '/' */
#define ASTERISK   0x002A   /* '*' */

/* Line terminators. */
#define CR         0x000D   /* carriage return */
#define LF         0x000A   /* line feed */

/* getNextToken() returns TOK_ERROR when it reads this value. */
#define BADBOM     0xFFFE
| 35 |
| 36 static int32_t lineCount; |
| 37 |
| 38 /* Protos */ |
| 39 static enum ETokenType getStringToken(UCHARBUF *buf, |
| 40 UChar32 initialChar, |
| 41 struct UString *token, |
| 42 UErrorCode *status); |
| 43 |
| 44 static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct USt
ring *token, UErrorCode *status); |
| 45 static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErr
orCode *status); |
| 46 static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErr
orCode *status); |
| 47 static UBool isWhitespace (UChar32 c); |
| 48 static UBool isNewline (UChar32 c); |
| 49 |
| 50 void resetLineNumber() { |
| 51 lineCount = 1; |
| 52 } |
| 53 |
| 54 /* Read and return the next token from the stream. If the token is of |
| 55 type eString, fill in the token parameter with the token. If the |
| 56 token is eError, then the status parameter will contain the |
| 57 specific error. This will be eItemNotFound at the end of file, |
| 58 indicating that all tokens have been returned. This method will |
| 59 never return eString twice in a row; instead, multiple adjacent |
| 60 string tokens will be merged into one, with no intervening |
| 61 space. */ |
| 62 enum ETokenType getNextToken(UCHARBUF* buf, |
| 63 struct UString *token, |
| 64 uint32_t *linenumber, /* out: linenumber of token *
/ |
| 65 struct UString *comment, |
| 66 UErrorCode *status) { |
| 67 enum ETokenType result; |
| 68 UChar32 c; |
| 69 |
| 70 if (U_FAILURE(*status)) { |
| 71 return TOK_ERROR; |
| 72 } |
| 73 |
| 74 /* Skip whitespace */ |
| 75 c = getNextChar(buf, TRUE, comment, status); |
| 76 |
| 77 if (U_FAILURE(*status)) { |
| 78 return TOK_ERROR; |
| 79 } |
| 80 |
| 81 *linenumber = lineCount; |
| 82 |
| 83 switch(c) { |
| 84 case BADBOM: |
| 85 return TOK_ERROR; |
| 86 case OPENBRACE: |
| 87 return TOK_OPEN_BRACE; |
| 88 case CLOSEBRACE: |
| 89 return TOK_CLOSE_BRACE; |
| 90 case COMMA: |
| 91 return TOK_COMMA; |
| 92 case U_EOF: |
| 93 return TOK_EOF; |
| 94 case COLON: |
| 95 return TOK_COLON; |
| 96 |
| 97 default: |
| 98 result = getStringToken(buf, c, token, status); |
| 99 } |
| 100 |
| 101 *linenumber = lineCount; |
| 102 return result; |
| 103 } |
| 104 |
| 105 /* Copy a string token into the given UnicodeString. Upon entry, we |
| 106 have already read the first character of the string token, which is |
| 107 not a whitespace character (but may be a QUOTE or ESCAPE). This |
| 108 function reads all subsequent characters that belong with this |
| 109 string, and copy them into the token parameter. The other |
| 110 important, and slightly convoluted purpose of this function is to |
| 111 merge adjacent strings. It looks forward a bit, and if the next |
| 112 non comment, non whitespace item is a string, it reads it in as |
| 113 well. If two adjacent strings are quoted, they are merged without |
| 114 intervening space. Otherwise a single SPACE character is |
| 115 inserted. */ |
| 116 static enum ETokenType getStringToken(UCHARBUF* buf, |
| 117 UChar32 initialChar, |
| 118 struct UString *token, |
| 119 UErrorCode *status) { |
| 120 UBool lastStringWasQuoted; |
| 121 UChar32 c; |
| 122 UChar target[3] = { '\0' }; |
| 123 UChar *pTarget = target; |
| 124 int len=0; |
| 125 UBool isFollowingCharEscaped=FALSE; |
| 126 UBool isNLUnescaped = FALSE; |
| 127 UChar32 prevC=0; |
| 128 |
| 129 /* We are guaranteed on entry that initialChar is not a whitespace |
| 130 character. If we are at the EOF, or have some other problem, it |
| 131 doesn't matter; we still want to validly return the initialChar |
| 132 (if nothing else) as a string token. */ |
| 133 |
| 134 if (U_FAILURE(*status)) { |
| 135 return TOK_ERROR; |
| 136 } |
| 137 |
| 138 /* setup */ |
| 139 lastStringWasQuoted = FALSE; |
| 140 c = initialChar; |
| 141 ustr_setlen(token, 0, status); |
| 142 |
| 143 if (U_FAILURE(*status)) { |
| 144 return TOK_ERROR; |
| 145 } |
| 146 |
| 147 for (;;) { |
| 148 if (c == QUOTE) { |
| 149 if (!lastStringWasQuoted && token->fLength > 0) { |
| 150 ustr_ucat(token, SPACE, status); |
| 151 |
| 152 if (U_FAILURE(*status)) { |
| 153 return TOK_ERROR; |
| 154 } |
| 155 } |
| 156 |
| 157 lastStringWasQuoted = TRUE; |
| 158 |
| 159 for (;;) { |
| 160 c = ucbuf_getc(buf,status); |
| 161 |
| 162 /* EOF reached */ |
| 163 if (c == U_EOF) { |
| 164 return TOK_EOF; |
| 165 } |
| 166 |
| 167 /* Unterminated quoted strings */ |
| 168 if (U_FAILURE(*status)) { |
| 169 return TOK_ERROR; |
| 170 } |
| 171 |
| 172 if (c == QUOTE && !isFollowingCharEscaped) { |
| 173 break; |
| 174 } |
| 175 |
| 176 if (c == ESCAPE && !isFollowingCharEscaped) { |
| 177 pTarget = target; |
| 178 c = unescape(buf, status); |
| 179 |
| 180 if (c == U_ERR) { |
| 181 return TOK_ERROR; |
| 182 } |
| 183 if(c == CR || c == LF){ |
| 184 isNLUnescaped = TRUE; |
| 185 } |
| 186 } |
| 187 |
| 188 if(c==ESCAPE && !isFollowingCharEscaped){ |
| 189 isFollowingCharEscaped = TRUE; |
| 190 }else{ |
| 191 U_APPEND_CHAR32(c, pTarget,len); |
| 192 pTarget = target; |
| 193 ustr_uscat(token, pTarget,len, status); |
| 194 isFollowingCharEscaped = FALSE; |
| 195 len=0; |
| 196 if(c == CR || c == LF){ |
| 197 if(isNLUnescaped == FALSE && prevC!=CR){ |
| 198 lineCount++; |
| 199 } |
| 200 isNLUnescaped = FALSE; |
| 201 } |
| 202 } |
| 203 |
| 204 if (U_FAILURE(*status)) { |
| 205 return TOK_ERROR; |
| 206 } |
| 207 prevC = c; |
| 208 } |
| 209 } else { |
| 210 if (token->fLength > 0) { |
| 211 ustr_ucat(token, SPACE, status); |
| 212 |
| 213 if (U_FAILURE(*status)) { |
| 214 return TOK_ERROR; |
| 215 } |
| 216 } |
| 217 |
| 218 if(lastStringWasQuoted){ |
| 219 if(getShowWarning()){ |
| 220 warning(lineCount, "Mixing quoted and unquoted strings"); |
| 221 } |
| 222 if(isStrict()){ |
| 223 return TOK_ERROR; |
| 224 } |
| 225 |
| 226 } |
| 227 |
| 228 lastStringWasQuoted = FALSE; |
| 229 |
| 230 /* if we reach here we are mixing |
| 231 * quoted and unquoted strings |
| 232 * warn in normal mode and error in |
| 233 * pedantic mode |
| 234 */ |
| 235 |
| 236 if (c == ESCAPE) { |
| 237 pTarget = target; |
| 238 c = unescape(buf, status); |
| 239 |
| 240 /* EOF reached */ |
| 241 if (c == U_EOF) { |
| 242 return TOK_ERROR; |
| 243 } |
| 244 } |
| 245 |
| 246 U_APPEND_CHAR32(c, pTarget,len); |
| 247 pTarget = target; |
| 248 ustr_uscat(token, pTarget,len, status); |
| 249 len=0; |
| 250 |
| 251 if (U_FAILURE(*status)) { |
| 252 return TOK_ERROR; |
| 253 } |
| 254 |
| 255 for (;;) { |
| 256 /* DON'T skip whitespace */ |
| 257 c = getNextChar(buf, FALSE, NULL, status); |
| 258 |
| 259 /* EOF reached */ |
| 260 if (c == U_EOF) { |
| 261 ucbuf_ungetc(c, buf); |
| 262 return TOK_STRING; |
| 263 } |
| 264 |
| 265 if (U_FAILURE(*status)) { |
| 266 return TOK_STRING; |
| 267 } |
| 268 |
| 269 if (c == QUOTE |
| 270 || c == OPENBRACE |
| 271 || c == CLOSEBRACE |
| 272 || c == COMMA |
| 273 || c == COLON) { |
| 274 ucbuf_ungetc(c, buf); |
| 275 break; |
| 276 } |
| 277 |
| 278 if (isWhitespace(c)) { |
| 279 break; |
| 280 } |
| 281 |
| 282 if (c == ESCAPE) { |
| 283 pTarget = target; |
| 284 c = unescape(buf, status); |
| 285 |
| 286 if (c == U_ERR) { |
| 287 return TOK_ERROR; |
| 288 } |
| 289 } |
| 290 |
| 291 U_APPEND_CHAR32(c, pTarget,len); |
| 292 pTarget = target; |
| 293 ustr_uscat(token, pTarget,len, status); |
| 294 len=0; |
| 295 if (U_FAILURE(*status)) { |
| 296 return TOK_ERROR; |
| 297 } |
| 298 } |
| 299 } |
| 300 |
| 301 /* DO skip whitespace */ |
| 302 c = getNextChar(buf, TRUE, NULL, status); |
| 303 |
| 304 if (U_FAILURE(*status)) { |
| 305 return TOK_STRING; |
| 306 } |
| 307 |
| 308 if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) { |
| 309 ucbuf_ungetc(c, buf); |
| 310 return TOK_STRING; |
| 311 } |
| 312 } |
| 313 } |
| 314 |
| 315 /* Retrieve the next character. If skipwhite is |
| 316 true, whitespace is skipped as well. */ |
| 317 static UChar32 getNextChar(UCHARBUF* buf, |
| 318 UBool skipwhite, |
| 319 struct UString *token, |
| 320 UErrorCode *status) { |
| 321 UChar32 c, c2; |
| 322 |
| 323 if (U_FAILURE(*status)) { |
| 324 return U_EOF; |
| 325 } |
| 326 |
| 327 for (;;) { |
| 328 c = ucbuf_getc(buf,status); |
| 329 |
| 330 if (c == U_EOF) { |
| 331 return U_EOF; |
| 332 } |
| 333 |
| 334 if (skipwhite && isWhitespace(c)) { |
| 335 continue; |
| 336 } |
| 337 |
| 338 /* This also handles the get() failing case */ |
| 339 if (c != SLASH) { |
| 340 return c; |
| 341 } |
| 342 |
| 343 c = ucbuf_getc(buf,status); /* "/c" */ |
| 344 |
| 345 if (c == U_EOF) { |
| 346 return U_EOF; |
| 347 } |
| 348 |
| 349 switch (c) { |
| 350 case SLASH: /* "//" */ |
| 351 seekUntilNewline(buf, NULL, status); |
| 352 break; |
| 353 |
| 354 case ASTERISK: /* " / * " */ |
| 355 c2 = ucbuf_getc(buf, status); /* "/ * c" */ |
| 356 if(c2 == ASTERISK){ /* "/ * *" */ |
| 357 /* parse multi-line comment and store it in token*/ |
| 358 seekUntilEndOfComment(buf, token, status); |
| 359 } else { |
| 360 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *"
. Include c2 back in buffer. */ |
| 361 seekUntilEndOfComment(buf, NULL, status); |
| 362 } |
| 363 break; |
| 364 |
| 365 default: |
| 366 ucbuf_ungetc(c, buf); /* "/c" - put back the c */ |
| 367 /* If get() failed this is a NOP */ |
| 368 return SLASH; |
| 369 } |
| 370 |
| 371 } |
| 372 } |
| 373 |
| 374 static void seekUntilNewline(UCHARBUF* buf, |
| 375 struct UString *token, |
| 376 UErrorCode *status) { |
| 377 UChar32 c; |
| 378 |
| 379 if (U_FAILURE(*status)) { |
| 380 return; |
| 381 } |
| 382 |
| 383 do { |
| 384 c = ucbuf_getc(buf,status); |
| 385 /* add the char to token */ |
| 386 if(token!=NULL){ |
| 387 ustr_u32cat(token, c, status); |
| 388 } |
| 389 } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR); |
| 390 } |
| 391 |
| 392 static void seekUntilEndOfComment(UCHARBUF *buf, |
| 393 struct UString *token, |
| 394 UErrorCode *status) { |
| 395 UChar32 c, d; |
| 396 uint32_t line; |
| 397 |
| 398 if (U_FAILURE(*status)) { |
| 399 return; |
| 400 } |
| 401 |
| 402 line = lineCount; |
| 403 |
| 404 do { |
| 405 c = ucbuf_getc(buf, status); |
| 406 |
| 407 if (c == ASTERISK) { |
| 408 d = ucbuf_getc(buf, status); |
| 409 |
| 410 if (d != SLASH) { |
| 411 ucbuf_ungetc(d, buf); |
| 412 } else { |
| 413 break; |
| 414 } |
| 415 } |
| 416 /* add the char to token */ |
| 417 if(token!=NULL){ |
| 418 ustr_u32cat(token, c, status); |
| 419 } |
| 420 /* increment the lineCount */ |
| 421 isNewline(c); |
| 422 |
| 423 } while (c != U_EOF && *status == U_ZERO_ERROR); |
| 424 |
| 425 if (c == U_EOF) { |
| 426 *status = U_INVALID_FORMAT_ERROR; |
| 427 error(line, "unterminated comment detected"); |
| 428 } |
| 429 } |
| 430 |
| 431 UChar32 unescape(UCHARBUF *buf, |
| 432 UErrorCode *status) { |
| 433 if (U_FAILURE(*status)) { |
| 434 return U_EOF; |
| 435 } |
| 436 |
| 437 /* We expect to be called after the ESCAPE has been seen, but |
| 438 * u_fgetcx needs an ESCAPE to do its magic. */ |
| 439 ucbuf_ungetc(ESCAPE, buf); |
| 440 |
| 441 return ucbuf_getcx32(buf, status); |
| 442 } |
| 443 |
| 444 static UBool isWhitespace(UChar32 c) { |
| 445 switch (c) { |
| 446 /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */ |
| 447 case 0x000A: |
| 448 case 0x2029: |
| 449 lineCount++; |
| 450 case 0x000D: |
| 451 case 0x0020: |
| 452 case 0x0009: |
| 453 case 0xFEFF: |
| 454 return TRUE; |
| 455 |
| 456 default: |
| 457 return FALSE; |
| 458 } |
| 459 } |
| 460 |
| 461 static UBool isNewline(UChar32 c) { |
| 462 switch (c) { |
| 463 /* '\n', '\r', 0x2029 */ |
| 464 case 0x000A: |
| 465 case 0x2029: |
| 466 lineCount++; |
| 467 case 0x000D: |
| 468 return TRUE; |
| 469 |
| 470 default: |
| 471 return FALSE; |
| 472 } |
| 473 } |
OLD | NEW |