glibc/locale/programs/linereader.c
Ulrich Drepper 4b10dd6c19 Update.
* locale/Makefile (distribute): Add iso-639.def and iso-3166.def.
	Change charset.h to charmap.h.
	(categories): Add new categories.  Leave out collate for now.
	Update build rules.
	* locale/categories.def: Add definitions for new categories.
	* locale/langinfo.h: Likewise.
	* locale/locale.h: Likewise.
	* locale/C-address.c: New file.
	* locale/C-identification.c: New file.
	* locale/C-measurement.c: New file.
	* locale/C-name.c: New file.
	* locale/C-paper.c: New file.
	* locale/C-telephone.c: New file.
	* locale/lc-address.c: Likewise.
	* locale/lc-identification.c: Likewise.
	* locale/lc-measurement.c: Likewise.
	* locale/lc-name.c: Likewise.
	* locale/lc-paper.c: Likewise.
	* locale/lc-telephone.c: Likewise.
	* locale/C-ctype.c: Update for locale rewrite.
	* locale/C-messages.c: Likewise.
	* locale/C-monetary.c: Likewise.
	* locale/C-time.c: Likewise.
	* locale/lc-collate.c: Likewise.
	* locale/lc-ctype.c: Likewise.
	* locale/lc-monetary.c: Likewise.
	* locale/lc-time.c: Likewise.
	* locale/localeinfo.h: Likewise.
	* locale/newlocale.c: Likewise.
	* locale/setlocale.c: Likewise.
	* locale/weight.h: Likewise.
	* locale/findlocale.c: Unconditionally use mmap.
	Handle new categories.
	* locale/loadlocale.c: Likewise.
	* locale/iso-3166.def: New file.
	* locale/iso-639.def: New file.
	* locale/programs/charmap-kw.gperf: Add new keywords.
	* locale/programs/locfile-kw.gperf: Likewise.
	* locale/programs/locfile-token.h: Define new tokens.
	* locale/programs/charmap.c: Rewrite to handle multibyte charsets.
	* locale/programs/charmap.h: New file.
	* locale/programs/charset.h: Removed.
	* locale/programs/config.h: Add __LC_LAST.
	* locale/programs/lc-address.c: New file.
	* locale/programs/lc-identification.c: New file.
	* locale/programs/lc-measurement.c: New file.
	* locale/programs/lc-name.c: New file.
	* locale/programs/lc-paper.c: New file.
	* locale/programs/lc-telephone.c: New file.
	* locale/programs/lc-collate.c: Update for locale rewrite.
	* locale/programs/lc-ctype.c: Likewise.
	* locale/programs/lc-messages.c: Likewise.
	* locale/programs/lc-monetary.c: Likewise.
	* locale/programs/lc-numeric.c: Likewise.
	* locale/programs/lc-time.c: Likewise.
	* locale/programs/locale.c: Likewise.
	* locale/programs/localedef.c: Likewise.
	* locale/programs/locfile.c: Likewise.
	* locale/programs/repertoire.c: Likewise.
	* locale/programs/repertoire.h: Likewise.
	* locale/programs/locfile.c: Update prototypes.
	Update handle_copy definition.
	* locale/programs/linereader.c: Add handling of wide char strings and
	new definition file syntax.
	* locale/programs/linereader.h (struct token): Add elements for wide
	character strings.
	* locale/programs/locale-spec.c: Disable handling of collation
	elements for now.
	* locale/programs/simple-hash.h: Cleanup.
	* locale/programs/stringtrans.h: Handle quite of end of line.
	* string/strcoll.c: Fall back on strcmp for now.
	* string/strxfrm.c: Fall back on strncpy/strlen for now.
	* time/strftime.c: Use new wide character data for wcsftime.
	* time/strptime.c: Remove _nl_C_LC_TIME declaration.
	* wctype/cname-lookup.h: Update for new LC_CTYPE data.
1999-08-31 07:04:41 +00:00

777 lines
16 KiB
C

/* Copyright (C) 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include <ctype.h>
#include <errno.h>
#include <libintl.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include "charmap.h"
#include "error.h"
#include "linereader.h"
#include "localedef.h"
#include "stringtrans.h"
/* Prototypes for local functions. */
static struct token *get_toplvl_escape (struct linereader *lr);
static struct token *get_symname (struct linereader *lr);
static struct token *get_ident (struct linereader *lr);
static struct token *get_string (struct linereader *lr,
const struct charmap_t *charmap,
const struct repertoire_t *repertoire);
struct linereader *
lr_open (const char *fname, kw_hash_fct_t hf)
{
FILE *fp;
struct linereader *result;
int n;
if (fname == NULL || strcmp (fname, "-") == 0
|| strcmp (fname, "/dev/stdin") == 0)
fp = stdin;
else
{
fp = fopen (fname, "r");
if (fp == NULL)
return NULL;
}
result = (struct linereader *) xmalloc (sizeof (*result));
result->fp = fp;
result->fname = xstrdup (fname ? : "<stdin>");
result->buf = NULL;
result->bufsize = 0;
result->lineno = 1;
result->idx = 0;
result->comment_char = '#';
result->escape_char = '\\';
result->translate_strings = 1;
n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
if (n < 0)
{
int save = errno;
fclose (result->fp);
free ((char *) result->fname);
free (result);
errno = save;
return NULL;
}
if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
n -= 2;
result->buf[n] = '\0';
result->bufact = n;
result->hash_fct = hf;
return result;
}
int
lr_eof (struct linereader *lr)
{
return lr->bufact = 0;
}
void
lr_close (struct linereader *lr)
{
fclose (lr->fp);
free (lr->buf);
free (lr);
}
int
lr_next (struct linereader *lr)
{
int n;
n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
if (n < 0)
return -1;
++lr->lineno;
if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
{
#if 0
/* XXX Is this correct? */
/* An escaped newline character is substituted with a single <SP>. */
--n;
lr->buf[n - 1] = ' ';
#else
n -= 2;
#endif
}
lr->buf[n] = '\0';
lr->bufact = n;
lr->idx = 0;
return 0;
}
/* Defined in error.c. */
/* This variable is incremented each time `error' is called. */
extern unsigned int error_message_count;
/* The calling program should define program_name and set it to the
name of the executing program. */
extern char *program_name;
struct token *
lr_token (struct linereader *lr, const struct charmap_t *charmap,
const struct repertoire_t *repertoire)
{
int ch;
while (1)
{
do
{
ch = lr_getc (lr);
if (ch == EOF)
{
lr->token.tok = tok_eof;
return &lr->token;
};
if (ch == '\n')
{
lr->token.tok = tok_eol;
return &lr->token;
}
}
while (isspace (ch));
if (ch == EOF)
{
lr->token.tok = tok_eof;
return &lr->token;
};
if (ch != lr->comment_char)
break;
/* Ignore rest of line. */
lr_ignore_rest (lr, 0);
lr->token.tok = tok_eol;
return &lr->token;
}
/* Match escape sequences. */
if (ch == lr->escape_char)
return get_toplvl_escape (lr);
/* Match ellipsis. */
if (ch == '.')
{
if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
{
lr_getc (lr);
lr_getc (lr);
lr_getc (lr);
lr->token.tok = tok_ellipsis4;
return &lr->token;
}
if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
{
lr_getc (lr);
lr_getc (lr);
lr->token.tok = tok_ellipsis3;
return &lr->token;
}
if (lr->buf[lr->idx] == '.')
{
lr_getc (lr);
lr->token.tok = tok_ellipsis2;
return &lr->token;
}
}
switch (ch)
{
case '<':
return get_symname (lr);
case '0' ... '9':
lr->token.tok = tok_number;
lr->token.val.num = ch - '0';
while (isdigit (ch = lr_getc (lr)))
{
lr->token.val.num *= 10;
lr->token.val.num += ch - '0';
}
if (isalpha (ch))
lr_error (lr, _("garbage at end of number"));
lr_ungetn (lr, 1);
return &lr->token;
case ';':
lr->token.tok = tok_semicolon;
return &lr->token;
case ',':
lr->token.tok = tok_comma;
return &lr->token;
case '(':
lr->token.tok = tok_open_brace;
return &lr->token;
case ')':
lr->token.tok = tok_close_brace;
return &lr->token;
case '"':
return get_string (lr, charmap, repertoire);
case '-':
ch = lr_getc (lr);
if (ch == '1')
{
lr->token.tok = tok_minus1;
return &lr->token;
}
lr_ungetn (lr, 2);
break;
}
return get_ident (lr);
}
static struct token *
get_toplvl_escape (struct linereader *lr)
{
/* This is supposed to be a numeric value. We return the
numerical value and the number of bytes. */
size_t start_idx = lr->idx - 1;
char *bytes = lr->token.val.charcode.bytes;
int nbytes = 0;
int ch;
do
{
unsigned int byte = 0;
unsigned int base = 8;
ch = lr_getc (lr);
if (ch == 'd')
{
base = 10;
ch = lr_getc (lr);
}
else if (ch == 'x')
{
base = 16;
ch = lr_getc (lr);
}
if ((base == 16 && !isxdigit (ch))
|| (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
{
esc_error:
lr->token.val.str.startmb = &lr->buf[start_idx];
while (ch != EOF && !isspace (ch))
ch = lr_getc (lr);
lr->token.val.str.lenmb = lr->idx - start_idx;
lr->token.tok = tok_error;
return &lr->token;
}
if (isdigit (ch))
byte = ch - '0';
else
byte = tolower (ch) - 'a' + 10;
ch = lr_getc (lr);
if ((base == 16 && !isxdigit (ch))
|| (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
goto esc_error;
byte *= base;
if (isdigit (ch))
byte += ch - '0';
else
byte += tolower (ch) - 'a' + 10;
ch = lr_getc (lr);
if (base != 16 && isdigit (ch))
{
byte *= base;
byte += ch - '0';
ch = lr_getc (lr);
}
bytes[nbytes++] = byte;
}
while (ch == lr->escape_char && nbytes < 4);
if (!isspace (ch))
lr_error (lr, _("garbage at end of character code specification"));
lr_ungetn (lr, 1);
lr->token.tok = tok_charcode;
lr->token.val.charcode.nbytes = nbytes;
return &lr->token;
}
#define ADDC(ch) \
do \
{ \
if (bufact == bufmax) \
{ \
bufmax *= 2; \
buf = xrealloc (buf, bufmax); \
} \
buf[bufact++] = (ch); \
} \
while (0)
#define ADDS(s, l) \
do \
{ \
size_t _l = (l); \
if (bufact + _l > bufmax) \
{ \
if (bufact < _l) \
bufact = _l; \
bufmax *= 2; \
buf = xrealloc (buf, bufmax); \
} \
memcpy (&buf[bufact], s, _l); \
bufact += _l; \
} \
while (0)
#define ADDWC(ch) \
do \
{ \
if (buf2act == buf2max) \
{ \
buf2max *= 2; \
buf2 = xrealloc (buf2, buf2max * 4); \
} \
buf2[buf2act++] = (ch); \
} \
while (0)
static struct token *
get_symname (struct linereader *lr)
{
/* Symbol in brackets. We must distinguish three kinds:
1. reserved words
2. ISO 10646 position values
3. all other. */
char *buf;
size_t bufact = 0;
size_t bufmax = 56;
const struct keyword_t *kw;
int ch;
buf = (char *) xmalloc (bufmax);
do
{
ch = lr_getc (lr);
if (ch == lr->escape_char)
{
int c2 = lr_getc (lr);
ADDC (c2);
if (c2 == '\n')
ch = '\n';
}
else
ADDC (ch);
}
while (ch != '>' && ch != '\n');
if (ch == '\n')
lr_error (lr, _("unterminated symbolic name"));
/* Test for ISO 10646 position value. */
if (buf[0] == 'U' && (bufact == 6 || bufact == 10))
{
char *cp = buf + 1;
while (cp < &buf[bufact - 1] && isxdigit (*cp))
++cp;
if (cp == &buf[bufact - 1])
{
/* Yes, it is. */
lr->token.tok = tok_ucs4;
lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16);
return &lr->token;
}
}
/* It is a symbolic name. Test for reserved words. */
kw = lr->hash_fct (buf, bufact - 1);
if (kw != NULL && kw->symname_or_ident == 1)
{
lr->token.tok = kw->token;
free (buf);
}
else
{
lr->token.tok = tok_bsymbol;
buf[bufact] = '\0';
buf = xrealloc (buf, bufact + 1);
lr->token.val.str.startmb = buf;
lr->token.val.str.lenmb = bufact - 1;
}
return &lr->token;
}
static struct token *
get_ident (struct linereader *lr)
{
char *buf;
size_t bufact;
size_t bufmax = 56;
const struct keyword_t *kw;
int ch;
buf = xmalloc (bufmax);
bufact = 0;
ADDC (lr->buf[lr->idx - 1]);
while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
&& ch != '<' && ch != ',')
{
if (ch == lr->escape_char)
{
ch = lr_getc (lr);
if (ch == '\n' || ch == EOF)
{
lr_error (lr, _("invalid escape sequence"));
break;
}
}
ADDC (ch);
}
lr_ungetn (lr, 1);
kw = lr->hash_fct (buf, bufact);
if (kw != NULL && kw->symname_or_ident == 0)
{
lr->token.tok = kw->token;
free (buf);
}
else
{
lr->token.tok = tok_ident;
buf[bufact] = '\0';
buf = xrealloc (buf, bufact + 1);
lr->token.val.str.startmb = buf;
lr->token.val.str.lenmb = bufact;
}
return &lr->token;
}
static struct token *
get_string (struct linereader *lr, const struct charmap_t *charmap,
const struct repertoire_t *repertoire)
{
int return_widestr = lr->return_widestr;
char *buf;
char *buf2 = NULL;
size_t bufact;
size_t bufmax = 56;
/* We must return two different strings. */
buf = xmalloc (bufmax);
bufact = 0;
/* We know it'll be a string. */
lr->token.tok = tok_string;
/* If we need not translate the strings (i.e., expand <...> parts)
we can run a simple loop. */
if (!lr->translate_strings)
{
int ch;
buf2 = NULL;
while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
ADDC (ch);
/* Catch errors with trailing escape character. */
if (bufact > 0 && buf[bufact - 1] == lr->escape_char
&& (bufact == 1 || buf[bufact - 2] != lr->escape_char))
{
lr_error (lr, _("illegal escape sequence at end of string"));
--bufact;
}
else if (ch == '\n' || ch == EOF)
lr_error (lr, _("unterminated string"));
ADDC ('\0');
}
else
{
int illegal_string = 0;
size_t buf2act = 0;
size_t buf2max = 56 * sizeof (uint32_t);
int ch;
int warned = 0;
/* We have to provide the wide character result as well. */
if (return_widestr)
buf2 = xmalloc (buf2max);
/* Read until the end of the string (or end of the line or file). */
while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
{
size_t startidx;
uint32_t wch;
struct charseq *seq;
if (ch != '<')
{
/* The standards leave it up to the implementation to decide
what to do with character which stand for themself. We
could jump through hoops to find out the value relative to
the charmap and the repertoire map, but instead we leave
it up to the locale definition author to write a better
definition. We assume here that every character which
stands for itself is encoded using ISO 8859-1. Using the
escape character is allowed. */
if (ch == lr->escape_char)
{
ch = lr_getc (lr);
if (ch == '\n' || ch == EOF)
break;
}
if (verbose && !warned)
{
lr_error (lr, _("\
non-symbolic character value should not be used"));
warned = 1;
}
ADDC (ch);
if (return_widestr)
ADDWC ((uint32_t) ch);
continue;
}
/* Now we have to search for the end of the symbolic name, i.e.,
the closing '>'. */
startidx = bufact;
while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
{
if (ch == lr->escape_char)
{
ch = lr_getc (lr);
if (ch == '\n' || ch == EOF)
break;
}
ADDC (ch);
}
if (ch == '\n' || ch == EOF)
/* Not a correct string. */
break;
if (bufact == startidx)
{
/* <> is no correct name. Ignore it and also signal an
error. */
illegal_string = 1;
continue;
}
/* It might be a Uxxxx symbol. */
if (buf[startidx] == 'U'
&& (bufact - startidx == 5 || bufact - startidx == 9))
{
char *cp = buf + startidx + 1;
while (cp < &buf[bufact] && isxdigit (*cp))
++cp;
if (cp == &buf[bufact])
{
const char *symbol = NULL;
/* Yes, it is. */
ADDC ('\0');
wch = strtoul (buf + startidx + 1, NULL, 16);
/* Now forget about the name we just added. */
bufact = startidx;
if (return_widestr)
ADDWC (wch);
/* Now determine from the repertoire the name of the
character and find it in the charmap. */
if (repertoire != NULL)
symbol = repertoire_find_symbol (repertoire, wch);
if (symbol == NULL)
{
/* We cannot generate a string since we cannot map
from the Unicode number to the character symbol. */
lr_error (lr,
_("character <U%0*X> not in repertoire map"),
wch > 0xffff ? 8 : 4, wch);
illegal_string = 1;
}
else
{
seq = charmap_find_value (charmap, symbol,
strlen (symbol));
if (seq == NULL)
{
/* Not a known name. */
lr_error (lr,
_("symbol `%s' not in charmap"), symbol);
illegal_string = 1;
}
else
ADDS (seq->bytes, seq->nbytes);
}
continue;
}
}
if (return_widestr)
{
/* We now have the symbolic name in buf[startidx] to
buf[bufact-1]. Now find out the value for this
character in the repertoire map as well as in the
charmap (in this order). */
wch = repertoire_find_value (repertoire, &buf[startidx],
bufact - startidx);
if (wch == ILLEGAL_CHAR_VALUE)
{
/* This name is not in the repertoire map. */
lr_error (lr, _("symbol `%.*s' not in repertoire map"),
bufact - startidx, &buf[startidx]);
illegal_string = 1;
}
else
ADDWC (wch);
}
/* Now the same for the multibyte representation. */
seq = charmap_find_value (charmap, &buf[startidx],
bufact - startidx);
if (seq == NULL)
{
/* This name is not in the charmap. */
lr_error (lr, _("symbol `%.*s' not in charmap"),
bufact - startidx, &buf[startidx]);
illegal_string = 1;
/* Now forget about the name we just added. */
bufact = startidx;
}
else
{
/* Now forget about the name we just added. */
bufact = startidx;
ADDS (seq->bytes, seq->nbytes);
}
}
if (ch == '\n' || ch == EOF)
{
lr_error (lr, _("unterminated string"));
illegal_string = 1;
}
if (illegal_string)
{
free (buf);
if (buf2 != NULL)
free (buf2);
lr->token.val.str.startmb = NULL;
lr->token.val.str.lenmb = 0;
return &lr->token;
}
ADDC ('\0');
if (return_widestr)
{
ADDWC (0);
lr->token.val.str.startwc = xrealloc (buf2,
buf2act * sizeof (uint32_t));
lr->token.val.str.lenwc = buf2act;
}
}
lr->token.val.str.startmb = xrealloc (buf, bufact);
lr->token.val.str.lenmb = bufact;
return &lr->token;
}