2000-07-05  Ulrich Drepper  <drepper@redhat.com>

	* locale/loadlocale.c (_nl_unload_locale): Add cast to avoid warning.

	* locale/programs/ld-collate.c (collate_output): Also write out the
	collation sequence values and the wide character string for the
	collation symbol table.

	* posix/fnmatch.c: Include "../locale/elem-hash.h".
	* posix/fnmatch_loop.c: Implement collating symbol handling.
	* posix/tst-fnmatch.input: Add more tests, especially for collating
	symbol handling.

	* posix/regex.c: Fix comment.
This commit is contained in:
Ulrich Drepper 2000-07-05 22:02:03 +00:00
parent 6e5f82858c
commit f3e29a1a0f
6 changed files with 498 additions and 68 deletions

View File

@ -1,3 +1,18 @@
2000-07-05 Ulrich Drepper <drepper@redhat.com>
* locale/loadlocale.c (_nl_unload_locale): Add cast to avoid warning.
* locale/programs/ld-collate.c (collate_output): Also write out the
collation sequence values and the wide character string for the
collation symbol table.
* posix/fnmatch.c: Include "../locale/elem-hash.h".
* posix/fnmatch_loop.c: Implement collating symbol handling.
* posix/tst-fnmatch.input: Add more tests, especially for collating
symbol handling.
* posix/regex.c: Fix comment.
2000-07-05 Andreas Jaeger <aj@suse.de>
* sysdeps/mips/fpu_control.h: Fix type of fpu_control_t.

View File

@ -2611,6 +2611,17 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
(sizeof (int32_t)
- ((1 + namelen + 1 + runp->nmbs)
% sizeof (int32_t))));
/* Now some 32-bit values: multibyte collation sequence,
wide char string (including length), and wide char
collation sequence. */
obstack_int_grow (&extrapool, runp->mbseqorder);
obstack_int_grow (&extrapool, runp->nwcs);
obstack_grow (&extrapool, runp->wcs,
runp->nwcs * sizeof (uint32_t));
obstack_int_grow (&extrapool, runp->wcseqorder);
}
}

View File

@ -53,6 +53,7 @@
we support a correct implementation only in glibc. */
#ifdef _LIBC
# include "../locale/localeinfo.h"
# include "../locale/elem-hash.h"
# define CONCAT(a,b) __CONCAT(a,b)
#endif

View File

@ -387,7 +387,10 @@ FCT (pattern, string, no_leading_period, flags)
const UCHAR *np = (const UCHAR *) n;
idx2 = findidx (&np);
# if !WIDE_CHAR_VERSION
# if WIDE_CHAR_VERSION
if (idx2 != 0 && weights[idx] == weights[idx2])
goto matched;
# else
if (idx2 != 0 && len == weights[idx2])
{
int cnt = 0;
@ -400,9 +403,6 @@ FCT (pattern, string, no_leading_period, flags)
if (cnt == len)
goto matched;
}
# else
if (idx2 != 0 && weights[idx] == weights[idx2])
goto matched;
# endif
}
}
@ -415,13 +415,187 @@ FCT (pattern, string, no_leading_period, flags)
return FNM_NOMATCH;
else
{
c = FOLD (c);
normal_bracket:
if (c == fn)
goto matched;
int is_seqval = 0;
int is_range = 0;
cold = c;
c = *p++;
#ifdef _LIBC
if (c == L('[') && *p == L('.'))
{
uint32_t nrules =
_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
const CHAR *startp = p;
size_t c1 = 0;
while (1)
{
c = *++p;
if (c == L('.') && p[1] == L(']'))
{
p += 2;
break;
}
if (c == '\0')
return FNM_NOMATCH;
++c1;
}
/* We have to handle the symbols differently in
ranges since then the collation sequence is
important. */
is_range = *p == L('-') && p[1] != L('\0');
if (nrules == 0)
{
/* There are no names defined in the collation
data. Therefore we only accept the trivial
names consisting of the character itself. */
if (c1 != 1)
return FNM_NOMATCH;
if (!is_range && *n == startp[1])
goto matched;
cold = startp[1];
c = *p++;
}
else
{
int32_t table_size;
const int32_t *symb_table;
# ifdef WIDE_CHAR_VERSION
char str[c1];
int strcnt;
# else
# define str (startp + 1)
# endif
const unsigned char *extra;
int32_t idx;
int32_t elem;
int32_t second;
int32_t hash;
# ifdef WIDE_CHAR_VERSION
/* We have to convert the name to a single-byte
string. This is possible since the names
consist of ASCII characters and the internal
representation is UCS4. */
for (strcnt = 0; strcnt < c1; ++strcnt)
str[strcnt] = startp[1 + strcnt];
#endif
table_size =
_NL_CURRENT_WORD (LC_COLLATE,
_NL_COLLATE_SYMB_HASH_SIZEMB);
symb_table = (const int32_t *)
_NL_CURRENT (LC_COLLATE,
_NL_COLLATE_SYMB_TABLEMB);
extra = (const unsigned char *)
_NL_CURRENT (LC_COLLATE,
_NL_COLLATE_SYMB_EXTRAMB);
/* Locate the character in the hashing table. */
hash = elem_hash (str, c1);
idx = 0;
elem = hash % table_size;
second = hash % (table_size - 2);
while (symb_table[2 * elem] != 0)
{
/* First compare the hashing value. */
if (symb_table[2 * elem] == hash
&& c1 == extra[symb_table[2 * elem + 1]]
&& memcmp (str,
&extra[symb_table[2 * elem + 1]
+ 1], c1) == 0)
{
/* Yep, this is the entry. */
idx = symb_table[2 * elem + 1];
idx += 1 + extra[idx];
break;
}
/* Next entry. */
elem += second;
}
if (symb_table[2 * elem] != 0)
{
/* Compare the byte sequence but only if
this is not part of a range. */
# ifdef WIDE_CHAR_VERSION
int32_t *wextra;
idx += 1 + extra[idx];
/* Adjust for the alignment. */
idx = (idx + 3) & ~4;
wextra = (int32_t *) &extra[idx + 4];
# endif
if (! is_range)
{
# ifdef WIDE_CHAR_VERSION
for (c1 = 0; c1 < wextra[idx]; ++c1)
if (n[c1] != wextra[1 + c1])
break;
if (c1 == wextra[idx])
goto matched;
# else
for (c1 = 0; c1 < extra[idx]; ++c1)
if (n[c1] != extra[1 + c1])
break;
if (c1 == extra[idx])
goto matched;
# endif
}
/* Get the collation sequence value. */
is_seqval = 1;
# ifdef WIDE_CHAR_VERSION
cold = wextra[1 + wextra[idx]];
# else
/* Adjust for the alignment. */
idx += 1 + extra[idx];
idx = (idx + 3) & ~4;
cold = *((int32_t *) &extra[idx]);
# endif
c = *p++;
}
else if (symb_table[2 * elem] != 0 && c1 == 1)
{
/* No valid character. Match it as a
single byte. */
if (!is_range && *n == str[0])
goto matched;
cold = str[0];
c = *p++;
}
else
return FNM_NOMATCH;
}
}
else
# undef str
#endif
{
c = FOLD (c);
normal_bracket:
/* We have to handle the symbols differently in
ranges since then the collation sequence is
important. */
is_range = *p == L('-') && p[1] != L('\0');
if (!is_range && c == fn)
goto matched;
cold = c;
c = *p++;
}
if (c == L('-') && *p != L(']'))
{
@ -434,23 +608,19 @@ FCT (pattern, string, no_leading_period, flags)
various characters appear in the source
file. A strange concept, nowhere
documented. */
int32_t fseqidx;
int32_t lseqidx;
uint32_t fcollseq;
uint32_t lcollseq;
UCHAR cend = *p++;
# ifdef WIDE_CHAR_VERSION
int idx;
size_t cnt;
# endif
if (!(flags & FNM_NOESCAPE) && cend == L('\\'))
cend = *p++;
if (cend == L('\0'))
return FNM_NOMATCH;
# ifdef WIDE_CHAR_VERSION
/* Search in the `names' array for the characters. */
fseqidx = fn % size;
idx = fn % size;
cnt = 0;
while (names[fseqidx] != fn)
while (names[idx] != fn)
{
if (++cnt == layers)
/* XXX We don't know anything about
@ -458,63 +628,210 @@ FCT (pattern, string, no_leading_period, flags)
match. This means we are failing. */
goto range_not_matched;
fseqidx += size;
idx += size;
}
lseqidx = cold % size;
cnt = 0;
while (names[lseqidx] != cold)
fcollseq = collseq[idx];
if (is_seqval)
lcollseq = cold;
else
{
if (++cnt == layers)
idx = cold % size;
cnt = 0;
while (names[idx] != cold)
{
lseqidx = -1;
break;
if (++cnt == layers)
{
idx = -1;
break;
}
idx += size;
}
lseqidx += size;
lcollseq = idx == -1 ? 0xffffffff : collseq[idx];
}
# else
fseqidx = fn;
lseqidx = cold;
fcollseq = collseq[fn];
lcollseq = is_seqval ? cold : collseq[(UCHAR) cold];
# endif
is_seqval = 0;
if (cend == L('[') && *p == L('.'))
{
uint32_t nrules =
_NL_CURRENT_WORD (LC_COLLATE,
_NL_COLLATE_NRULES);
const CHAR *startp = p;
size_t c1 = 0;
while (1)
{
c = *++p;
if (c == L('.') && p[1] == L(']'))
{
p += 2;
break;
}
if (c == '\0')
return FNM_NOMATCH;
++c1;
}
if (nrules == 0)
{
/* There are no names defined in the
collation data. Therefore we only
accept the trivial names consisting
of the character itself. */
if (c1 != 1)
return FNM_NOMATCH;
cend = startp[1];
}
else
{
int32_t table_size;
const int32_t *symb_table;
# ifdef WIDE_CHAR_VERSION
char str[c1];
int strcnt;
# else
# define str (startp + 1)
# endif
const unsigned char *extra;
int32_t idx;
int32_t elem;
int32_t second;
int32_t hash;
# ifdef WIDE_CHAR_VERSION
/* We have to convert the name to a single-byte
string. This is possible since the names
consist of ASCII characters and the internal
representation is UCS4. */
for (strcnt = 0; strcnt < c1; ++strcnt)
str[strcnt] = startp[1 + strcnt];
#endif
table_size =
_NL_CURRENT_WORD (LC_COLLATE,
_NL_COLLATE_SYMB_HASH_SIZEMB);
symb_table = (const int32_t *)
_NL_CURRENT (LC_COLLATE,
_NL_COLLATE_SYMB_TABLEMB);
extra = (const unsigned char *)
_NL_CURRENT (LC_COLLATE,
_NL_COLLATE_SYMB_EXTRAMB);
/* Locate the character in the hashing
table. */
hash = elem_hash (str, c1);
idx = 0;
elem = hash % table_size;
second = hash % (table_size - 2);
while (symb_table[2 * elem] != 0)
{
/* First compare the hashing value. */
if (symb_table[2 * elem] == hash
&& (c1
== extra[symb_table[2 * elem + 1]])
&& memcmp (str,
&extra[symb_table[2 * elem + 1]
+ 1], c1) == 0)
{
/* Yep, this is the entry. */
idx = symb_table[2 * elem + 1];
idx += 1 + extra[idx];
break;
}
/* Next entry. */
elem += second;
}
if (symb_table[2 * elem] != 0)
{
/* Compare the byte sequence but only if
this is not part of a range. */
# ifdef WIDE_CHAR_VERSION
int32_t *wextra;
idx += 1 + extra[idx];
/* Adjust for the alignment. */
idx = (idx + 3) & ~4;
wextra = (int32_t *) &extra[idx + 4];
# endif
/* Get the collation sequence value. */
is_seqval = 1;
# ifdef WIDE_CHAR_VERSION
cend = wextra[1 + wextra[idx]];
# else
/* Adjust for the alignment. */
idx += 1 + extra[idx];
idx = (idx + 3) & ~4;
cend = *((int32_t *) &extra[idx]);
# endif
}
else if (symb_table[2 * elem] != 0 && c1 == 1)
{
cend = str[0];
c = *p++;
}
else
return FNM_NOMATCH;
}
# undef str
}
else
{
if (!(flags & FNM_NOESCAPE) && cend == L('\\'))
cend = *p++;
if (cend == L('\0'))
return FNM_NOMATCH;
cend = FOLD (cend);
}
/* XXX It is not entirely clear to me how to handle
characters which are not mentioned in the
collation specification. */
if (
# ifdef WIDE_CHAR_VERSION
lseqidx == -1 ||
lcollseq == 0xffffffff ||
# endif
collseq[lseqidx] <= collseq[fseqidx])
lcollseq <= fcollseq)
{
/* We have to look at the upper bound. */
int32_t hseqidx;
uint32_t hcollseq;
cend = FOLD (cend);
# ifdef WIDE_CHAR_VERSION
hseqidx = cend % size;
cnt = 0;
while (names[hseqidx] != cend)
if (is_seqval)
hcollseq = cend;
else
{
if (++cnt == layers)
{
/* Hum, no information about the upper
bound. The matching succeeds if the
lower bound is matched exactly. */
if (lseqidx == -1 || cold != fn)
goto range_not_matched;
goto matched;
}
}
# else
hseqidx = cend;
# endif
if (
# ifdef WIDE_CHAR_VERSION
(lseqidx == -1
&& collseq[fseqidx] == collseq[hseqidx]) ||
idx = cend % size;
cnt = 0;
while (names[idx] != cend)
{
if (++cnt == layers)
{
/* Hum, no information about the upper
bound. The matching succeeds if the
lower bound is matched exactly. */
if (idx == -1 && lcollseq != fcollseq)
goto range_not_matched;
goto matched;
}
}
hcollseq = collseq[idx];
# else
hcollseq = collseq[cend];
# endif
collseq[fseqidx] <= collseq[hseqidx])
}
if (lcollseq <= hcollseq && fcollseq <= hcollseq)
goto matched;
}
# ifdef WIDE_CHAR_VERSION
@ -553,6 +870,7 @@ FCT (pattern, string, no_leading_period, flags)
/* Skip the rest of the [...] that already matched. */
do
{
ignore_next:
c = *p++;
if (c == L('\0'))
@ -568,12 +886,52 @@ FCT (pattern, string, no_leading_period, flags)
}
else if (c == L('[') && *p == L(':'))
{
do
if (*++p == L('\0'))
return FNM_NOMATCH;
while (*p != L(':') || p[1] == L(']'));
int c1 = 0;
const CHAR *startp = p;
while (1)
{
c = *++p;
if (++c1 == CHAR_CLASS_MAX_LENGTH)
return FNM_NOMATCH;
if (*p == L(':') && p[1] == L(']'))
break;
if (c < L('a') || c >= L('z'))
{
p = startp;
goto ignore_next;
}
}
p += 2;
c = *p;
c = *p++;
}
else if (c == L('[') && *p == L('='))
{
c = *++p;
if (c == L('\0'))
return FNM_NOMATCH;
c = *++p;
if (c != L('=') || p[1] != L(']'))
return FNM_NOMATCH;
p += 2;
c = *p++;
}
else if (c == L('[') && *p == L('.'))
{
++p;
while (1)
{
c = *++p;
if (c == '\0')
return FNM_NOMATCH;
if (*p == L('.') && p[1] == L(']'))
break;
}
p += 2;
c = *p++;
}
}
while (c != L(']'));

View File

@ -2690,7 +2690,7 @@ regex_compile (pattern, size, syntax, bufp)
PATFETCH (c);
/* Now add the multibyte character(s) we found
to the acceptabed list.
to the accept list.
XXX Note that this is not entirely correct.
we would have to match multibyte sequences

View File

@ -70,23 +70,34 @@ C "]" "[!a]" 0
C "]]" "[!a]]" 0
# B.6 012(C)
# *** implement [. .]
C "a" "[[.a.]]" 0
C "-" "[[.-.]]" 0
C "-" "[[.-.][.].]]" 0
C "-" "[[.].][.-.]]" 0
C "-" "[[.-.][=u=]]" 0
C "-" "[[.-.][:alpha:]]" 0
C "a" "[![.a.]]" NOMATCH
# B.6 013(C)
# *** implement [. .]
C "a" "[[.b.]]" NOMATCH
C "a" "[[.b.][.c.]]" NOMATCH
C "a" "[[.b.][=b=]]" NOMATCH
# B.6 014(C)
# *** implement [. .]
# B.6 015(C)
C "a" "[[=a=]]" 0
C "b" "[[=a=]b]" 0
C "b" "[[=a=][=b=]]" 0
C "a" "[[=a=][=b=]]" 0
C "a" "[[=a=][.b.]]" 0
C "a" "[[=a=][:digit:]]" 0
# B.6 016(C)
C "=" "[[=a=]b]" NOMATCH
C "]" "[[=a=]b]" NOMATCH
C "a" "[[=b=]]" NOMATCH
C "a" "[[=b=][=c=]]" NOMATCH
C "a" "[[=b=][.].]]" NOMATCH
C "a" "[[=b=][:digit:]]" NOMATCH
# B.6 017(C)
C "a" "[[:alnum:]]" 0
@ -225,6 +236,10 @@ C "a" "[[alpha]]" NOMATCH
C "a" "[[alpha:]]" NOMATCH
C "a]" "[[alpha]]" 0
C "a]" "[[alpha:]]" 0
C "a" "[[:alpha:][.b.]]" 0
C "a" "[[:alpha:][=b=]]" 0
C "a" "[[:alpha:][:digit:]]" 0
C "a" "[[:digit:][:alpha:]]" 0
# B.6 018(C)
C "a" "[a-c]" 0
@ -236,9 +251,28 @@ C "B" "[a-c]" NOMATCH
C "b" "[A-C]" NOMATCH
C "" "[a-c]" NOMATCH
C "as" "[a-ca-z]" NOMATCH
C "a" "[[.a.]-c]" 0
C "a" "[a-[.c.]]" 0
C "a" "[[.a.]-[.c.]]" 0
C "b" "[[.a.]-c]" 0
C "b" "[a-[.c.]]" 0
C "b" "[[.a.]-[.c.]]" 0
C "c" "[[.a.]-c]" 0
C "c" "[a-[.c.]]" 0
C "c" "[[.a.]-[.c.]]" 0
C "d" "[[.a.]-c]" NOMATCH
C "d" "[a-[.c.]]" NOMATCH
C "d" "[[.a.]-[.c.]]" NOMATCH
# B.6 019(C)
C "b" "[c-a]" NOMATCH
C "a" "[c-a]" NOMATCH
C "a" "[[.c.]-a]" NOMATCH
C "a" "[c-[.a.]]" NOMATCH
C "a" "[[.c.]-[.a.]]" NOMATCH
C "c" "[c-a]" NOMATCH
C "c" "[[.c.]-a]" NOMATCH
C "c" "[c-[.a.]]" NOMATCH
C "c" "[[.c.]-[.a.]]" NOMATCH
# B.6 020(C)
C "a" "[a-c0-9]" 0
@ -394,23 +428,34 @@ de_DE.ISO-8859-1 "a" "[[=a=]b]" 0
de_DE.ISO-8859-1 "â" "[[=a=]b]" 0
de_DE.ISO-8859-1 "à" "[[=a=]b]" 0
de_DE.ISO-8859-1 "á" "[[=a=]b]" 0
de_DE.ISO-8859-1 "ä" "[[=a=]b]" 0
de_DE.ISO-8859-1 "b" "[[=a=]b]" 0
de_DE.ISO-8859-1 "c" "[[=a=]b]" NOMATCH
de_DE.ISO-8859-1 "a" "[[=â=]b]" 0
de_DE.ISO-8859-1 "â" "[[=â=]b]" 0
de_DE.ISO-8859-1 "à" "[[=â=]b]" 0
de_DE.ISO-8859-1 "á" "[[=â=]b]" 0
de_DE.ISO-8859-1 "ä" "[[=â=]b]" 0
de_DE.ISO-8859-1 "b" "[[=â=]b]" 0
de_DE.ISO-8859-1 "c" "[[=â=]b]" NOMATCH
de_DE.ISO-8859-1 "a" "[[=à=]b]" 0
de_DE.ISO-8859-1 "â" "[[=à=]b]" 0
de_DE.ISO-8859-1 "à" "[[=à=]b]" 0
de_DE.ISO-8859-1 "á" "[[=à=]b]" 0
de_DE.ISO-8859-1 "ä" "[[=à=]b]" 0
de_DE.ISO-8859-1 "b" "[[=à=]b]" 0
de_DE.ISO-8859-1 "c" "[[=à=]b]" NOMATCH
de_DE.ISO-8859-1 "a" "[[=á=]b]" 0
de_DE.ISO-8859-1 "â" "[[=á=]b]" 0
de_DE.ISO-8859-1 "à" "[[=á=]b]" 0
de_DE.ISO-8859-1 "á" "[[=á=]b]" 0
de_DE.ISO-8859-1 "ä" "[[=á=]b]" 0
de_DE.ISO-8859-1 "b" "[[=á=]b]" 0
de_DE.ISO-8859-1 "c" "[[=á=]b]" NOMATCH
de_DE.ISO-8859-1 "a" "[[=ä=]b]" 0
de_DE.ISO-8859-1 "â" "[[=ä=]b]" 0
de_DE.ISO-8859-1 "à" "[[=ä=]b]" 0
de_DE.ISO-8859-1 "á" "[[=ä=]b]" 0
de_DE.ISO-8859-1 "ä" "[[=ä=]b]" 0
de_DE.ISO-8859-1 "b" "[[=ä=]b]" 0
de_DE.ISO-8859-1 "c" "[[=ä=]b]" NOMATCH