1999-08-16 21:50:52 +00:00
|
|
|
/*
|
|
|
|
*******************************************************************************
|
1999-12-13 22:28:37 +00:00
|
|
|
*
|
|
|
|
* Copyright (C) 1998-1999, International Business Machines
|
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
*
|
1999-08-16 21:50:52 +00:00
|
|
|
*******************************************************************************
|
|
|
|
*
|
|
|
|
* File read.c
|
|
|
|
*
|
|
|
|
* Modification History:
|
|
|
|
*
|
|
|
|
* Date Name Description
|
|
|
|
* 05/26/99 stephen Creation.
|
|
|
|
*******************************************************************************
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "read.h"
|
|
|
|
#include "error.h"
|
2000-01-28 22:40:27 +00:00
|
|
|
#include "unicode/ustdio.h"
|
2000-05-15 18:39:17 +00:00
|
|
|
#include "unicode/ustring.h"
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
/* Characters the tokenizer compares against the UChar values it reads. */
#define OPENBRACE  0x007B  /* '{'  */
#define CLOSEBRACE 0x007D  /* '}'  */
#define COMMA      0x002C  /* ','  */
#define QUOTE      0x0022  /* '"'  */
#define ESCAPE     0x005C  /* '\\' */
#define SLASH      0x002F  /* '/'  */
#define ASTERISK   0x002A  /* '*'  */
#define SPACE      0x0020  /* ' '  */
#define COLON      0x003A  /* ':' -- defined, but not treated specially by the code in this file */
#define BADBOM     0xFFFE  /* U+FEFF (the byte order mark) with its two bytes swapped */
|
1999-08-16 21:50:52 +00:00
|
|
|
|
2000-05-15 18:39:17 +00:00
|
|
|
U_STRING_DECL(k_start_string, "string", 6);
|
|
|
|
U_STRING_DECL(k_start_binary, "binary", 6);
|
|
|
|
U_STRING_DECL(k_start_table, "table", 5);
|
|
|
|
U_STRING_DECL(k_start_int, "int", 3);
|
|
|
|
U_STRING_DECL(k_start_array, "array", 5);
|
|
|
|
U_STRING_DECL(k_start_intvector, "intvector", 9);
|
|
|
|
U_STRING_DECL(k_start_reserved, "reserved", 8);
|
|
|
|
|
2000-05-18 22:08:39 +00:00
|
|
|
static UBool didInit=FALSE;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
2000-07-20 19:56:52 +00:00
|
|
|
extern int32_t lineCount;
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
/* Protos */
|
|
|
|
static enum ETokenType getStringToken(UFILE *f, UChar initialChar,
|
2000-05-18 21:25:51 +00:00
|
|
|
struct UString *token,
|
|
|
|
UErrorCode *status);
|
1999-08-16 21:50:52 +00:00
|
|
|
static UChar unescape(UFILE *f, UErrorCode *status);
|
2000-05-18 22:08:39 +00:00
|
|
|
static UChar getNextChar(UFILE *f, UBool skipwhite, UErrorCode *status);
|
1999-08-16 21:50:52 +00:00
|
|
|
static void seekUntilNewline(UFILE *f, UErrorCode *status);
|
|
|
|
static void seekUntilEndOfComment(UFILE *f, UErrorCode *status);
|
2000-05-18 22:08:39 +00:00
|
|
|
static UBool isWhitespace(UChar c);
|
|
|
|
static UBool isNewline(UChar c);
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
|
|
|
|
/* Read and return the next token from the stream. If the token is of
|
|
|
|
type eString, fill in the token parameter with the token. If the
|
|
|
|
token is eError, then the status parameter will contain the
|
|
|
|
specific error. This will be eItemNotFound at the end of file,
|
|
|
|
indicating that all tokens have been returned. This method will
|
|
|
|
never return eString twice in a row; instead, multiple adjacent
|
|
|
|
string tokens will be merged into one, with no intervening
|
|
|
|
space. */
|
|
|
|
enum ETokenType getNextToken(UFILE *f,
|
2000-05-18 21:25:51 +00:00
|
|
|
struct UString *token,
|
|
|
|
UErrorCode *status)
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
2000-05-15 18:39:17 +00:00
|
|
|
UChar c;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
2000-05-18 21:25:51 +00:00
|
|
|
/*enum ETokenType tokenType;*/
|
2000-05-15 18:39:17 +00:00
|
|
|
|
|
|
|
if(U_FAILURE(*status)) return tok_error;
|
|
|
|
|
|
|
|
/* Skip whitespace */
|
|
|
|
c = getNextChar(f, TRUE, status);
|
|
|
|
if(U_FAILURE(*status)) return tok_error;
|
|
|
|
|
|
|
|
switch(c) {
|
2000-05-18 21:25:51 +00:00
|
|
|
case BADBOM: return tok_error;
|
2000-05-15 18:39:17 +00:00
|
|
|
case OPENBRACE: return tok_open_brace;
|
|
|
|
case CLOSEBRACE: return tok_close_brace;
|
|
|
|
case COMMA: return tok_comma;
|
|
|
|
case U_EOF: return tok_EOF;
|
2000-05-18 21:25:51 +00:00
|
|
|
/*
|
|
|
|
case COLON: return tok_colon;
|
2000-05-15 18:39:17 +00:00
|
|
|
c = getNextChar(f, TRUE, status);
|
|
|
|
tokenType = getStringToken(f, c, token, status);
|
|
|
|
break;
|
2000-05-18 21:25:51 +00:00
|
|
|
*/
|
2000-05-15 18:39:17 +00:00
|
|
|
default: return getStringToken(f, c, token, status);
|
|
|
|
}
|
|
|
|
if(!didInit) {
|
|
|
|
U_STRING_INIT(k_start_string, "string", 6);
|
|
|
|
U_STRING_INIT(k_start_binary, "binary", 6);
|
|
|
|
U_STRING_INIT(k_start_table, "table", 5);
|
|
|
|
U_STRING_INIT(k_start_int, "int", 3);
|
|
|
|
U_STRING_INIT(k_start_array, "array", 5);
|
|
|
|
U_STRING_INIT(k_start_intvector, "intvector", 9);
|
|
|
|
U_STRING_INIT(k_start_reserved, "reserved", 8);
|
|
|
|
didInit=TRUE;
|
|
|
|
}
|
|
|
|
if(u_strcmp(token->fChars, k_start_string) == 0) {
|
|
|
|
return(tok_start_string);
|
|
|
|
} else if(u_strcmp(token->fChars, k_start_binary) == 0) {
|
|
|
|
return(tok_start_binary);
|
|
|
|
} else if(u_strcmp(token->fChars, k_start_table) == 0) {
|
|
|
|
return(tok_start_table);
|
|
|
|
} else if(u_strcmp(token->fChars, k_start_int) == 0) {
|
|
|
|
return(tok_start_int);
|
|
|
|
} else if(u_strcmp(token->fChars, k_start_array) == 0) {
|
|
|
|
return(tok_start_array);
|
|
|
|
} else if(u_strcmp(token->fChars, k_start_intvector) == 0) {
|
|
|
|
return(tok_start_intvector);
|
|
|
|
} else if(u_strcmp(token->fChars, k_start_reserved) == 0) {
|
|
|
|
return(tok_start_reserved);
|
|
|
|
} else {
|
|
|
|
return tok_error;
|
|
|
|
}
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Copy a string token into the given UnicodeString. Upon entry, we
|
|
|
|
have already read the first character of the string token, which is
|
|
|
|
not a whitespace character (but may be a QUOTE or ESCAPE). This
|
|
|
|
function reads all subsequent characters that belong with this
|
|
|
|
string, and copy them into the token parameter. The other
|
|
|
|
important, and slightly convoluted purpose of this function is to
|
|
|
|
merge adjacent strings. It looks forward a bit, and if the next
|
|
|
|
non comment, non whitespace item is a string, it reads it in as
|
|
|
|
well. If two adjacent strings are quoted, they are merged without
|
|
|
|
intervening space. Otherwise a single SPACE character is
|
|
|
|
inserted. */
|
|
|
|
static enum ETokenType getStringToken(UFILE *f,
|
|
|
|
UChar initialChar,
|
|
|
|
struct UString *token,
|
|
|
|
UErrorCode *status)
|
|
|
|
{
|
2000-05-18 22:08:39 +00:00
|
|
|
UBool lastStringWasQuoted;
|
1999-08-16 21:50:52 +00:00
|
|
|
UChar c;
|
|
|
|
|
|
|
|
/* We are guaranteed on entry that initialChar is not a whitespace
|
|
|
|
character. If we are at the EOF, or have some other problem, it
|
|
|
|
doesn't matter; we still want to validly return the initialChar
|
|
|
|
(if nothing else) as a string token. */
|
|
|
|
|
1999-10-18 22:48:32 +00:00
|
|
|
if(U_FAILURE(*status)) return tok_error;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
/* setup */
|
|
|
|
lastStringWasQuoted = FALSE;
|
|
|
|
c = initialChar;
|
|
|
|
ustr_setlen(token, 0, status);
|
|
|
|
|
1999-10-18 22:48:32 +00:00
|
|
|
if(U_FAILURE(*status)) return tok_error;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
for(;;) {
|
|
|
|
if(c == QUOTE) {
|
|
|
|
if( ! lastStringWasQuoted && token->fLength > 0) {
|
|
|
|
ustr_ucat(token, SPACE, status);
|
1999-10-18 22:48:32 +00:00
|
|
|
if(U_FAILURE(*status)) return tok_error;
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
lastStringWasQuoted = TRUE;
|
|
|
|
|
|
|
|
for(;;) {
|
2000-01-28 22:40:27 +00:00
|
|
|
c = u_fgetc(f);
|
|
|
|
/* c = u_fgetc(f, status);*/
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
/* EOF reached */
|
2000-06-01 17:05:12 +00:00
|
|
|
if(c == (UChar)U_EOF) {
|
|
|
|
return tok_EOF;
|
|
|
|
}
|
1999-08-16 21:50:52 +00:00
|
|
|
/* Unterminated quoted strings */
|
1999-10-18 22:48:32 +00:00
|
|
|
if(U_FAILURE(*status)) return tok_error;
|
1999-08-16 21:50:52 +00:00
|
|
|
if(c == QUOTE)
|
|
|
|
break;
|
|
|
|
if(c == ESCAPE)
|
|
|
|
c = unescape(f, status);
|
|
|
|
ustr_ucat(token, c, status);
|
1999-10-18 22:48:32 +00:00
|
|
|
if(U_FAILURE(*status)) return tok_error;
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
if(token->fLength > 0) {
|
|
|
|
ustr_ucat(token, SPACE, status);
|
1999-10-18 22:48:32 +00:00
|
|
|
if(U_FAILURE(*status)) return tok_error;
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
lastStringWasQuoted = FALSE;
|
|
|
|
|
|
|
|
if(c == ESCAPE)
|
|
|
|
c = unescape(f, status);
|
|
|
|
ustr_ucat(token, c, status);
|
1999-10-18 22:48:32 +00:00
|
|
|
if(U_FAILURE(*status)) return tok_error;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
for(;;) {
|
|
|
|
/* DON'T skip whitespace */
|
|
|
|
c = getNextChar(f, FALSE, status);
|
2000-07-20 19:56:52 +00:00
|
|
|
/* EOF reached */
|
|
|
|
if(c == (UChar)U_EOF) {
|
|
|
|
u_fungetc(c, f);
|
|
|
|
return tok_string;
|
|
|
|
}
|
|
|
|
|
1999-10-18 22:48:32 +00:00
|
|
|
if(U_FAILURE(*status))
|
1999-08-16 21:50:52 +00:00
|
|
|
return tok_string;
|
|
|
|
|
|
|
|
if(c == QUOTE
|
|
|
|
|| c == OPENBRACE
|
|
|
|
|| c == CLOSEBRACE
|
2000-05-15 18:39:17 +00:00
|
|
|
|| c == COMMA
|
2000-05-18 21:25:51 +00:00
|
|
|
/*|| c == COLON*/)
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
2000-01-28 22:40:27 +00:00
|
|
|
u_fungetc(c, f);
|
|
|
|
/*u_fungetc(c, f, status);*/
|
1999-08-16 21:50:52 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if(isWhitespace(c))
|
|
|
|
break;
|
|
|
|
|
|
|
|
if(c == ESCAPE)
|
|
|
|
c = unescape(f, status);
|
|
|
|
ustr_ucat(token, c, status);
|
1999-10-18 22:48:32 +00:00
|
|
|
if(U_FAILURE(*status)) return tok_error;
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* DO skip whitespace */
|
|
|
|
c = getNextChar(f, TRUE, status);
|
1999-10-18 22:48:32 +00:00
|
|
|
if(U_FAILURE(*status))
|
1999-08-16 21:50:52 +00:00
|
|
|
return tok_string;
|
|
|
|
|
2000-05-18 21:25:51 +00:00
|
|
|
if(c == OPENBRACE || c == CLOSEBRACE || c == COMMA/* || c == COLON*/) {
|
2000-01-28 22:40:27 +00:00
|
|
|
u_fungetc(c, f);
|
|
|
|
/*u_fungetc(c, f, status);*/
|
1999-08-16 21:50:52 +00:00
|
|
|
return tok_string;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Retrieve the next character, ignoring comments. If skipwhite is
|
|
|
|
true, whitespace is skipped as well. */
|
|
|
|
static UChar getNextChar(UFILE *f,
|
2000-05-18 22:08:39 +00:00
|
|
|
UBool skipwhite,
|
1999-08-16 21:50:52 +00:00
|
|
|
UErrorCode *status)
|
|
|
|
{
|
|
|
|
UChar c;
|
|
|
|
|
1999-10-18 22:48:32 +00:00
|
|
|
if(U_FAILURE(*status)) return U_EOF;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
for(;;) {
|
2000-01-28 22:40:27 +00:00
|
|
|
c = u_fgetc(f);
|
|
|
|
/*c = u_fgetc(f, status);*/
|
1999-08-16 21:50:52 +00:00
|
|
|
if(c == (UChar)U_EOF) return U_EOF;
|
|
|
|
|
|
|
|
if(skipwhite && isWhitespace(c))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* This also handles the get() failing case */
|
|
|
|
if(c != SLASH)
|
|
|
|
return c;
|
|
|
|
|
2000-01-28 22:40:27 +00:00
|
|
|
c = u_fgetc(f);
|
|
|
|
/* c = u_fgetc(f, status);*/
|
1999-08-16 21:50:52 +00:00
|
|
|
if(c == (UChar)U_EOF) return U_EOF;
|
|
|
|
|
|
|
|
switch(c) {
|
|
|
|
case SLASH:
|
|
|
|
seekUntilNewline(f, status);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case ASTERISK:
|
|
|
|
/* Note that we silently ignore an unterminated comment */
|
|
|
|
seekUntilEndOfComment(f, status);
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
2000-01-28 22:40:27 +00:00
|
|
|
u_fungetc(c, f);
|
|
|
|
/*u_fungetc(c, f, status);*/
|
1999-08-16 21:50:52 +00:00
|
|
|
/* If get() failed this is a NOP */
|
|
|
|
return SLASH;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void seekUntilNewline(UFILE *f,
|
|
|
|
UErrorCode *status)
|
|
|
|
{
|
|
|
|
UChar c;
|
|
|
|
|
1999-10-18 22:48:32 +00:00
|
|
|
if(U_FAILURE(*status)) return;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
do {
|
2000-01-28 22:40:27 +00:00
|
|
|
c = u_fgetc(f);
|
|
|
|
/* c = u_fgetc(f, status);*/
|
1999-10-07 00:07:53 +00:00
|
|
|
} while(! isNewline(c) && c != (UChar)U_EOF && *status == U_ZERO_ERROR);
|
1999-08-16 21:50:52 +00:00
|
|
|
|
1999-10-18 22:48:32 +00:00
|
|
|
/*if(U_FAILURE(*status))
|
1999-08-16 21:50:52 +00:00
|
|
|
err = kItemNotFound;*/
|
|
|
|
}
|
|
|
|
|
|
|
|
void seekUntilEndOfComment(UFILE *f,
|
|
|
|
UErrorCode *status)
|
|
|
|
{
|
|
|
|
UChar c, d;
|
|
|
|
|
1999-10-18 22:48:32 +00:00
|
|
|
if(U_FAILURE(*status)) return;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
do {
|
2000-01-28 22:40:27 +00:00
|
|
|
c = u_fgetc(f);
|
|
|
|
/* c = u_fgetc(f, status);*/
|
1999-08-16 21:50:52 +00:00
|
|
|
if(c == ASTERISK) {
|
2000-01-28 22:40:27 +00:00
|
|
|
d = u_fgetc(f);
|
|
|
|
/* d = u_fgetc(f, status);*/
|
1999-08-16 21:50:52 +00:00
|
|
|
if(d != SLASH)
|
2000-01-28 22:40:27 +00:00
|
|
|
u_fungetc(d, f);
|
|
|
|
/*u_fungetc(d, f, status);*/
|
1999-08-16 21:50:52 +00:00
|
|
|
else
|
|
|
|
break;
|
|
|
|
}
|
1999-10-07 00:07:53 +00:00
|
|
|
} while(c != (UChar)U_EOF && *status == U_ZERO_ERROR);
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
if(c == (UChar)U_EOF) {
|
1999-10-07 00:07:53 +00:00
|
|
|
*status = U_INVALID_FORMAT_ERROR;
|
1999-08-16 21:50:52 +00:00
|
|
|
setErrorText("Unterminated comment detected");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static UChar unescape(UFILE *f,
|
|
|
|
UErrorCode *status)
|
|
|
|
{
|
1999-10-18 22:48:32 +00:00
|
|
|
if(U_FAILURE(*status)) return U_EOF;
|
2000-07-16 13:43:40 +00:00
|
|
|
/* We expect to be called after the ESCAPE has been seen, but
|
|
|
|
* u_fgetcx needs an ESCAPE to do its magic. */
|
|
|
|
u_fungetc(ESCAPE, f);
|
|
|
|
return (UChar) u_fgetcx(f);
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
2000-05-18 22:08:39 +00:00
|
|
|
/* Return TRUE if c is one of ' ', '\t', '\n', '\r', U+2029 or U+FEFF,
 * FALSE otherwise.
 *
 * Side effect: lineCount is incremented when c is '\n' (U+000A) or U+2029,
 * so call this exactly once for each character read.
 */
static UBool isWhitespace(UChar c)
{
    /* Line-ending whitespace: counted as a new line. */
    if (c == 0x000A || c == 0x2029) {
        lineCount++;
        return TRUE;
    }

    /* Whitespace that does not advance the line counter. */
    if (c == 0x000D || c == 0x0020 || c == 0x0009 || c == 0xFEFF) {
        return TRUE;
    }

    return FALSE;
}
|
|
|
|
|
2000-05-18 22:08:39 +00:00
|
|
|
/* Return TRUE if c is '\n', '\r' or U+2029, FALSE otherwise.
 *
 * Side effect: lineCount is incremented when c is '\n' (U+000A) or U+2029,
 * but not for '\r' (U+000D).
 */
static UBool isNewline(UChar c)
{
    if (c == 0x000A || c == 0x2029) {
        lineCount++;
        return TRUE;
    }

    if (c == 0x000D) {
        return TRUE;
    }

    return FALSE;
}
|