Rewrite iconv option parsing [BZ #19519]

This commit replaces string manipulation during `iconv_open' and iconv_prog
option parsing with a structured, flag based conversion specification.  In
doing so, it alters the internal `__gconv_open' interface and accordingly
adjusts its uses.

This change fixes several hangs in the iconv program and therefore includes
a new test to exercise iconv_prog options that originally led to these hangs.
It also includes a new regression test for option handling in the iconv
function.

Reviewed-by: Florian Weimer <fweimer@redhat.com>
Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
This commit is contained in:
Arjun Shankar 2020-07-07 20:31:48 +02:00
parent 94d9c76e4a
commit 91927b7c76
11 changed files with 989 additions and 144 deletions

View File

@ -26,7 +26,7 @@ headers = iconv.h gconv.h
routines = iconv_open iconv iconv_close \
gconv_open gconv gconv_close gconv_db gconv_conf \
gconv_builtin gconv_simple gconv_trans gconv_cache
routines += gconv_dl
routines += gconv_dl gconv_charset
vpath %.c ../locale/programs ../intl
@ -44,7 +44,7 @@ CFLAGS-linereader.c += -DNO_TRANSLITERATION
CFLAGS-simple-hash.c += -I../locale
tests = tst-iconv1 tst-iconv2 tst-iconv3 tst-iconv4 tst-iconv5 tst-iconv6 \
tst-iconv7 tst-iconv-mt
tst-iconv7 tst-iconv-mt tst-iconv-opt
others = iconv_prog iconvconfig
install-others-programs = $(inst_bindir)/iconv
@ -61,6 +61,7 @@ include $(patsubst %,$(..)libof-iterator.mk,$(cpp-srcs-left))
ifeq ($(run-built-tests),yes)
xtests-special += $(objpfx)test-iconvconfig.out
tests-special += $(objpfx)tst-iconv_prog.out
endif
# Make a copy of the file because gconv module names are constructed
@ -81,6 +82,13 @@ endif
include ../Rules
ifeq ($(run-built-tests),yes)
LOCALES := en_US.UTF-8
include ../gen-locales.mk
$(objpfx)tst-iconv-opt.out: $(gen-locales)
endif
$(inst_bindir)/iconv: $(objpfx)iconv_prog $(+force)
$(do-install-program)
@ -95,3 +103,8 @@ $(objpfx)test-iconvconfig.out: /dev/null $(objpfx)iconvconfig
cmp $$tmp $(inst_gconvdir)/gconv-modules.cache; \
rm -f $$tmp) > $@; \
$(evaluate-test)
$(objpfx)tst-iconv_prog.out: tst-iconv_prog.sh $(objpfx)iconv_prog
$(BASH) $< $(common-objdir) '$(test-wrapper-env)' \
'$(run-program-env)' > $@; \
$(evaluate-test)

View File

@ -6,6 +6,7 @@ libc {
GLIBC_PRIVATE {
# functions shared with iconv program
__gconv_get_alias_db; __gconv_get_cache; __gconv_get_modules_db;
__gconv_open; __gconv_create_spec;
# function used by the gconv modules
__gconv_transliterate;

218
iconv/gconv_charset.c Normal file
View File

@ -0,0 +1,218 @@
/* Charset name normalization.
Copyright (C) 2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <stdlib.h>
#include <ctype.h>
#include <locale.h>
#include <stdbool.h>
#include <string.h>
#include <sys/stat.h>
#include "gconv_int.h"
#include "gconv_charset.h"
/* This function returns a pointer to the last suffix in a conversion code
string. Valid suffixes matched by this function are of the form: '/' or ','
followed by arbitrary text that doesn't contain '/' or ','. It does not
edit the string in any way. The caller is expected to parse the suffix and
remove it (by e.g. truncating the string) before the next call. */
static char *
find_suffix (char *s)
{
/* The conversion code is in the form of a triplet, separated by '/' chars.
The third component of the triplet contains suffixes. If we don't have two
slashes, we don't have a suffix. */
int slash_count = 0;
char *suffix_term = NULL;
for (int i = 0; s[i] != '\0'; i++)
switch (s[i])
{
case '/':
slash_count++;
/* Fallthrough */
case ',':
suffix_term = &s[i];
}
if (slash_count >= 2)
return suffix_term;
return NULL;
}
struct gconv_parsed_code
{
char *code;
bool translit;
bool ignore;
};
/* This function parses an iconv_open encoding PC.CODE, strips any suffixes
(such as TRANSLIT or IGNORE) from it and sets corresponding flags in it. */
static void
gconv_parse_code (struct gconv_parsed_code *pc)
{
pc->translit = false;
pc->ignore = false;
while (1)
{
/* First drop any trailing whitespaces and separators. */
size_t len = strlen (pc->code);
while ((len > 0)
&& (isspace (pc->code[len - 1])
|| pc->code[len - 1] == ','
|| pc->code[len - 1] == '/'))
len--;
pc->code[len] = '\0';
if (len == 0)
return;
char * suffix = find_suffix (pc->code);
if (suffix == NULL)
{
/* At this point, we have processed and removed all suffixes from the
code and what remains of the code is suffix free. */
return;
}
else
{
/* A suffix is processed from the end of the code array going
backwards, one suffix at a time. The suffix is an index into the
code character array and points to: one past the end of the code
and any unprocessed suffixes, and to the beginning of the suffix
currently being processed during this iteration. We must process
this suffix and then drop it from the code by terminating the
preceding text with NULL.
We want to allow and recognize suffixes such as:
"/TRANSLIT" i.e. single suffix
"//TRANSLIT" i.e. single suffix and multiple separators
"//TRANSLIT/IGNORE" i.e. suffixes separated by "/"
"/TRANSLIT//IGNORE" i.e. suffixes separated by "//"
"//IGNORE,TRANSLIT" i.e. suffixes separated by ","
"//IGNORE," i.e. trailing ","
"//TRANSLIT/" i.e. trailing "/"
"//TRANSLIT//" i.e. trailing "//"
"/" i.e. empty suffix.
Unknown suffixes are silently discarded and ignored. */
if ((__strcasecmp_l (suffix,
GCONV_TRIPLE_SEPARATOR
GCONV_TRANSLIT_SUFFIX,
_nl_C_locobj_ptr) == 0)
|| (__strcasecmp_l (suffix,
GCONV_SUFFIX_SEPARATOR
GCONV_TRANSLIT_SUFFIX,
_nl_C_locobj_ptr) == 0))
pc->translit = true;
if ((__strcasecmp_l (suffix,
GCONV_TRIPLE_SEPARATOR
GCONV_IGNORE_ERRORS_SUFFIX,
_nl_C_locobj_ptr) == 0)
|| (__strcasecmp_l (suffix,
GCONV_SUFFIX_SEPARATOR
GCONV_IGNORE_ERRORS_SUFFIX,
_nl_C_locobj_ptr) == 0))
pc->ignore = true;
/* We just processed this suffix. We can now drop it from the
code string by truncating it at the suffix's position. */
suffix[0] = '\0';
}
}
}
/* This function accepts the charset names of the source and destination of the
conversion and populates *conv_spec with an equivalent conversion
specification that may later be used by __gconv_open. The charset names
might contain options in the form of suffixes that alter the conversion,
e.g. "ISO-10646/UTF-8/TRANSLIT". It processes the charset names, ignoring
and truncating any suffix options in fromcode, and processing and truncating
any suffix options in tocode. Supported suffix options ("TRANSLIT" or
"IGNORE") when found in tocode lead to the corresponding flag in *conv_spec
to be set to true. Unrecognized suffix options are silently discarded. If
the function succeeds, it returns conv_spec back to the caller. It returns
NULL upon failure. conv_spec must be allocated and freed by the caller. */
struct gconv_spec *
__gconv_create_spec (struct gconv_spec *conv_spec, const char *fromcode,
const char *tocode)
{
struct gconv_parsed_code pfc, ptc;
struct gconv_spec *ret = NULL;
pfc.code = __strdup (fromcode);
ptc.code = __strdup (tocode);
if ((pfc.code == NULL)
|| (ptc.code == NULL))
goto out;
gconv_parse_code (&pfc);
gconv_parse_code (&ptc);
/* We ignore suffixes in the fromcode because that is how the current
implementation has always handled them. Only suffixes in the tocode are
processed and handled. The reality is that invalid input in the input
character set should only be ignored if the fromcode specifies IGNORE.
The current implementation ignores invalid intput in the input character
set if the tocode contains IGNORE. We preserve this behavior for
backwards compatibility. In the future we may split the handling of
IGNORE to allow a finer grained specification of ignorning invalid input
and/or ignoring invalid output. */
conv_spec->translit = ptc.translit;
conv_spec->ignore = ptc.ignore;
/* 3 extra bytes because 1 extra for '\0', and 2 extra so strip might
be able to add one or two trailing '/' characters if necessary. */
conv_spec->fromcode = malloc (strlen (fromcode) + 3);
if (conv_spec->fromcode == NULL)
goto out;
conv_spec->tocode = malloc (strlen (tocode) + 3);
if (conv_spec->tocode == NULL)
{
free (conv_spec->fromcode);
conv_spec->fromcode = NULL;
goto out;
}
/* Strip unrecognized characters and ensure that the code has two '/'
characters as per conversion code triplet specification. */
strip (conv_spec->fromcode, pfc.code);
strip (conv_spec->tocode, ptc.code);
ret = conv_spec;
out:
free (pfc.code);
free (ptc.code);
return ret;
}
libc_hidden_def (__gconv_create_spec)

View File

@ -19,9 +19,68 @@
#include <ctype.h>
#include <locale.h>
#include <stdbool.h>
#include <string.h>
#include <sys/stat.h>
#include <stdlib.h>
#include "gconv_int.h"
static void
/* An iconv encoding is in the form of a triplet, with parts separated by
a '/' character. The first part is the standard name, the second part is
the character set, and the third part is the error handler. If the first
part is sufficient to identify both the standard and the character set
then the second part can be empty e.g. UTF-8//. If the first part is not
sufficient to identify both the standard and the character set then the
second part is required e.g. ISO-10646/UTF8/. If neither the first or
second parts are provided e.g. //, then the current locale is used.
The actual values used in the first and second parts are not entirely
relevant to the implementation. The values themselves are used in a hash
table to lookup modules and so the naming convention of the first two parts
is somewhat arbitrary and only helps locate the entries in the cache.
The third part is the error handler and is comprised of a ',' or '/'
separated list of suffixes. Currently, we support "TRANSLIT" for
transliteration and "IGNORE" for ignoring conversion errors due to
unrecognized input characters. */
#define GCONV_TRIPLE_SEPARATOR "/"
#define GCONV_SUFFIX_SEPARATOR ","
#define GCONV_TRANSLIT_SUFFIX "TRANSLIT"
#define GCONV_IGNORE_ERRORS_SUFFIX "IGNORE"
/* This function accepts the charset names of the source and destination of the
conversion and populates *conv_spec with an equivalent conversion
specification that may later be used by __gconv_open. The charset names
might contain options in the form of suffixes that alter the conversion,
e.g. "ISO-10646/UTF-8/TRANSLIT". It processes the charset names, ignoring
and truncating any suffix options in fromcode, and processing and truncating
any suffix options in tocode. Supported suffix options ("TRANSLIT" or
"IGNORE") when found in tocode lead to the corresponding flag in *conv_spec
to be set to true. Unrecognized suffix options are silently discarded. If
the function succeeds, it returns conv_spec back to the caller. It returns
NULL upon failure. */
struct gconv_spec *
__gconv_create_spec (struct gconv_spec *conv_spec, const char *fromcode,
const char *tocode);
libc_hidden_proto (__gconv_create_spec)
/* This function frees all heap memory allocated by __gconv_create_spec. */
static void __attribute__ ((unused))
gconv_destroy_spec (struct gconv_spec *conv_spec)
{
free (conv_spec->fromcode);
free (conv_spec->tocode);
return;
}
/* This function copies in-order, characters from the source 's' that are
either alpha-numeric or one in one of these: "_-.,:/" - into the destination
'wp' while dropping all other characters. In the process, it converts all
alphabetical characters to upper case. It then appends up to two '/'
characters so that the total number of '/'es in the destination is 2. */
static inline void __attribute__ ((unused, always_inline))
strip (char *wp, const char *s)
{
int slash_count = 0;

View File

@ -75,6 +75,15 @@ struct gconv_module
};
/* The specification of the conversion that needs to be performed. */
struct gconv_spec
{
char *fromcode;
char *tocode;
bool translit;
bool ignore;
};
/* Flags for `gconv_open'. */
enum
{
@ -136,10 +145,12 @@ __libc_lock_define (extern, __gconv_lock attribute_hidden)
})
/* Return in *HANDLE decriptor for transformation from FROMSET to TOSET. */
extern int __gconv_open (const char *toset, const char *fromset,
__gconv_t *handle, int flags)
attribute_hidden;
/* Return in *HANDLE, a decriptor for the transformation. The function expects
the specification of the transformation in the structure pointed to by
CONV_SPEC. It only reads *CONV_SPEC and does not take ownership of it. */
extern int __gconv_open (struct gconv_spec *conv_spec,
__gconv_t *handle, int flags);
libc_hidden_proto (__gconv_open)
/* Free resources associated with transformation descriptor CD. */
extern int __gconv_close (__gconv_t cd)

View File

@ -31,7 +31,7 @@
int
__gconv_open (const char *toset, const char *fromset, __gconv_t *handle,
__gconv_open (struct gconv_spec *conv_spec, __gconv_t *handle,
int flags)
{
struct __gconv_step *steps;
@ -40,77 +40,38 @@ __gconv_open (const char *toset, const char *fromset, __gconv_t *handle,
size_t cnt = 0;
int res;
int conv_flags = 0;
const char *errhand;
const char *ignore;
bool translit = false;
char *tocode, *fromcode;
/* Find out whether any error handling method is specified. */
errhand = strchr (toset, '/');
if (errhand != NULL)
errhand = strchr (errhand + 1, '/');
if (__glibc_likely (errhand != NULL))
{
if (*++errhand == '\0')
errhand = NULL;
else
{
/* Make copy without the error handling description. */
char *newtoset = (char *) alloca (errhand - toset + 1);
char *tok;
char *ptr = NULL /* Work around a bogus warning */;
translit = conv_spec->translit;
newtoset[errhand - toset] = '\0';
toset = memcpy (newtoset, toset, errhand - toset);
if (conv_spec->ignore)
conv_flags |= __GCONV_IGNORE_ERRORS;
/* Find the appropriate transliteration handlers. */
tok = strdupa (errhand);
tok = __strtok_r (tok, ",", &ptr);
while (tok != NULL)
{
if (__strcasecmp_l (tok, "TRANSLIT", _nl_C_locobj_ptr) == 0)
translit = true;
else if (__strcasecmp_l (tok, "IGNORE", _nl_C_locobj_ptr) == 0)
/* Set the flag to ignore all errors. */
conv_flags |= __GCONV_IGNORE_ERRORS;
tok = __strtok_r (NULL, ",", &ptr);
}
}
}
/* For the source character set we ignore the error handler specification.
XXX Is this really always the best? */
ignore = strchr (fromset, '/');
if (ignore != NULL && (ignore = strchr (ignore + 1, '/')) != NULL
&& *++ignore != '\0')
{
char *newfromset = (char *) alloca (ignore - fromset + 1);
newfromset[ignore - fromset] = '\0';
fromset = memcpy (newfromset, fromset, ignore - fromset);
}
tocode = conv_spec->tocode;
fromcode = conv_spec->fromcode;
/* If the string is empty define this to mean the charset of the
currently selected locale. */
if (strcmp (toset, "//") == 0)
if (strcmp (tocode, "//") == 0)
{
const char *codeset = _NL_CURRENT (LC_CTYPE, CODESET);
size_t len = strlen (codeset);
char *dest;
toset = dest = (char *) alloca (len + 3);
tocode = dest = (char *) alloca (len + 3);
memcpy (__mempcpy (dest, codeset, len), "//", 3);
}
if (strcmp (fromset, "//") == 0)
if (strcmp (fromcode, "//") == 0)
{
const char *codeset = _NL_CURRENT (LC_CTYPE, CODESET);
size_t len = strlen (codeset);
char *dest;
fromset = dest = (char *) alloca (len + 3);
fromcode = dest = (char *) alloca (len + 3);
memcpy (__mempcpy (dest, codeset, len), "//", 3);
}
res = __gconv_find_transform (toset, fromset, &steps, &nsteps, flags);
res = __gconv_find_transform (tocode, fromcode, &steps, &nsteps, flags);
if (res == __GCONV_OK)
{
/* Allocate room for handle. */
@ -209,3 +170,4 @@ __gconv_open (const char *toset, const char *fromset, __gconv_t *handle,
*handle = result;
return res;
}
libc_hidden_def (__gconv_open)

View File

@ -31,49 +31,15 @@
iconv_t
iconv_open (const char *tocode, const char *fromcode)
{
/* Normalize the name. We remove all characters beside alpha-numeric,
'_', '-', '/', '.', and ':'. */
size_t tocode_len = strlen (tocode) + 3;
char *tocode_conv;
bool tocode_usealloca = __libc_use_alloca (tocode_len);
if (tocode_usealloca)
tocode_conv = (char *) alloca (tocode_len);
else
{
tocode_conv = (char *) malloc (tocode_len);
if (tocode_conv == NULL)
return (iconv_t) -1;
}
strip (tocode_conv, tocode);
tocode = (tocode_conv[2] == '\0' && tocode[0] != '\0'
? upstr (tocode_conv, tocode) : tocode_conv);
size_t fromcode_len = strlen (fromcode) + 3;
char *fromcode_conv;
bool fromcode_usealloca = __libc_use_alloca (fromcode_len);
if (fromcode_usealloca)
fromcode_conv = (char *) alloca (fromcode_len);
else
{
fromcode_conv = (char *) malloc (fromcode_len);
if (fromcode_conv == NULL)
{
if (! tocode_usealloca)
free (tocode_conv);
return (iconv_t) -1;
}
}
strip (fromcode_conv, fromcode);
fromcode = (fromcode_conv[2] == '\0' && fromcode[0] != '\0'
? upstr (fromcode_conv, fromcode) : fromcode_conv);
__gconv_t cd;
int res = __gconv_open (tocode, fromcode, &cd, 0);
struct gconv_spec conv_spec;
if (! fromcode_usealloca)
free (fromcode_conv);
if (! tocode_usealloca)
free (tocode_conv);
if (__gconv_create_spec (&conv_spec, fromcode, tocode) == NULL)
return (iconv_t) -1;
int res = __gconv_open (&conv_spec, &cd, 0);
gconv_destroy_spec (&conv_spec);
if (__builtin_expect (res, __GCONV_OK) != __GCONV_OK)
{

View File

@ -39,6 +39,7 @@
#include <gconv_int.h>
#include "iconv_prog.h"
#include "iconvconfig.h"
#include "gconv_charset.h"
/* Get libc version number. */
#include "../version.h"
@ -118,8 +119,7 @@ main (int argc, char *argv[])
{
int status = EXIT_SUCCESS;
int remaining;
iconv_t cd;
const char *orig_to_code;
__gconv_t cd;
struct charmap_t *from_charmap = NULL;
struct charmap_t *to_charmap = NULL;
@ -139,39 +139,6 @@ main (int argc, char *argv[])
exit (EXIT_SUCCESS);
}
/* If we have to ignore errors make sure we use the appropriate name for
the to-character-set. */
orig_to_code = to_code;
if (omit_invalid)
{
const char *errhand = strchrnul (to_code, '/');
int nslash = 2;
char *newp;
char *cp;
if (*errhand == '/')
{
--nslash;
errhand = strchrnul (errhand + 1, '/');
if (*errhand == '/')
{
--nslash;
errhand = strchr (errhand, '\0');
}
}
newp = (char *) alloca (errhand - to_code + nslash + 7 + 1);
cp = mempcpy (newp, to_code, errhand - to_code);
while (nslash-- > 0)
*cp++ = '/';
if (cp[-1] != '/')
*cp++ = ',';
memcpy (cp, "IGNORE", sizeof ("IGNORE"));
to_code = newp;
}
/* POSIX 1003.2b introduces a silly thing: the arguments to -t anf -f
can be file names of charmaps. In this case iconv will have to read
those charmaps and use them to do the conversion. But there are
@ -184,10 +151,10 @@ main (int argc, char *argv[])
file. */
from_charmap = charmap_read (from_code, /*0, 1*/1, 0, 0, 0);
if (strchr (orig_to_code, '/') != NULL)
if (strchr (to_code, '/') != NULL)
/* The to-name might be a charmap file name. Try reading the
file. */
to_charmap = charmap_read (orig_to_code, /*0, 1,*/1, 0, 0, 0);
to_charmap = charmap_read (to_code, /*0, 1,*/1, 0, 0, 0);
/* At this point we have to handle two cases. The first one is
@ -201,9 +168,25 @@ main (int argc, char *argv[])
argc, remaining, argv, output_file);
else
{
struct gconv_spec conv_spec;
int res;
if (__gconv_create_spec (&conv_spec, from_code, to_code) == NULL)
{
error (EXIT_FAILURE, errno,
_("failed to start conversion processing"));
exit (1);
}
if (omit_invalid)
conv_spec.ignore = true;
/* Let's see whether we have these coded character sets. */
cd = iconv_open (to_code, from_code);
if (cd == (iconv_t) -1)
res = __gconv_open (&conv_spec, &cd, 0);
gconv_destroy_spec (&conv_spec);
if (res != __GCONV_OK)
{
if (errno == EINVAL)
{
@ -221,7 +204,7 @@ main (int argc, char *argv[])
const char *from_pretty =
(from_code[0] ? from_code : nl_langinfo (CODESET));
const char *to_pretty =
(orig_to_code[0] ? orig_to_code : nl_langinfo (CODESET));
(to_code[0] ? to_code : nl_langinfo (CODESET));
if (from_wrong)
{

347
iconv/tst-iconv-opt.c Normal file
View File

@ -0,0 +1,347 @@
/* Test iconv's TRANSLIT and IGNORE option handling
Copyright (C) 2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <iconv.h>
#include <locale.h>
#include <errno.h>
#include <string.h>
#include <support/support.h>
#include <support/check.h>
/* Run one iconv test. Arguments:
to: destination character set and options
from: source character set
input: input string to be converted
exp_in: expected number of bytes consumed
exp_ret: expected return value (error or number of irreversible conversions)
exp_out: expected output string
exp_err: expected value of `errno' after iconv returns. */
static void
test_iconv (const char *to, const char *from, char *input, size_t exp_in,
size_t exp_ret, const char *exp_out, int exp_err)
{
iconv_t cd;
char outbuf[500];
size_t inlen, outlen;
char *inptr, *outptr;
size_t n;
cd = iconv_open (to, from);
TEST_VERIFY (cd != (iconv_t) -1);
inlen = strlen (input);
outlen = sizeof (outbuf);
inptr = input;
outptr = outbuf;
errno = 0;
n = iconv (cd, &inptr, &inlen, &outptr, &outlen);
TEST_COMPARE (n, exp_ret);
TEST_VERIFY (inptr == input + exp_in);
TEST_COMPARE (errno, exp_err);
TEST_COMPARE_BLOB (outbuf, outptr - outbuf, exp_out, strlen (exp_out));
TEST_VERIFY (iconv_close (cd) == 0);
}
/* We test option parsing by converting UTF-8 inputs to ASCII under various
option combinations. The UTF-8 inputs fall into three categories:
- ASCII-only,
- non-ASCII,
- non-ASCII with invalid UTF-8 characters. */
/* 1. */
char ascii[] = "Just some ASCII text";
/* 2. Valid UTF-8 input and some corresponding expected outputs with various
options. The two non-ASCII characters below are accented alphabets:
an `a' then an `o'. */
char utf8[] = "UTF-8 text with \u00E1 couple \u00F3f non-ASCII characters";
char u2a[] = "UTF-8 text with ";
char u2a_translit[] = "UTF-8 text with a couple of non-ASCII characters";
char u2a_ignore[] = "UTF-8 text with couple f non-ASCII characters";
/* 3. Invalid UTF-8 input and some corresponding expected outputs. \xff is
invalid UTF-8. It's followed by some valid but non-ASCII UTF-8. */
char iutf8[] = "Invalid UTF-8 \xff\u27E6text\u27E7";
char iu2a[] = "Invalid UTF-8 ";
char iu2a_ignore[] = "Invalid UTF-8 text";
char iu2a_both[] = "Invalid UTF-8 [|text|]";
/* 4. Another invalid UTF-8 input and corresponding expected outputs. This time
the valid non-ASCII UTF-8 characters appear before the invalid \xff. */
char jutf8[] = "Invalid \u27E6UTF-8\u27E7 \xfftext";
char ju2a[] = "Invalid ";
char ju2a_translit[] = "Invalid [|UTF-8|] ";
char ju2a_ignore[] = "Invalid UTF-8 text";
char ju2a_both[] = "Invalid [|UTF-8|] text";
/* We also test option handling for character set names that have the form
"A/B". In this test, we test conversions "ISO-10646/UTF-8", and either
ISO-8859-1 or ASCII. */
/* 5. Accented 'A' and 'a' characters in ISO-8859-1 and UTF-8, and an
equivalent ASCII transliteration. */
char iso8859_1_a[] = {0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, /* Accented A's. */
0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, /* Accented a's. */
0x00};
char utf8_a[] = "\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5"
"\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5";
char ascii_a[] = "AAAAAAaaaaaa";
/* 6. An invalid ASCII string where [0] is invalid and [1] is '~'. */
char iascii [] = {0x80, '~', '\0'};
char empty[] = "";
char ia2u_ignore[] = "~";
static int
do_test (void)
{
xsetlocale (LC_ALL, "en_US.UTF-8");
/* 0. iconv_open should gracefully fail for invalid character sets. */
TEST_VERIFY (iconv_open ("INVALID", "UTF-8") == (iconv_t) -1);
TEST_VERIFY (iconv_open ("UTF-8", "INVALID") == (iconv_t) -1);
TEST_VERIFY (iconv_open ("INVALID", "INVALID") == (iconv_t) -1);
/* 1. ASCII-only UTF-8 input should convert to ASCII with no changes: */
test_iconv ("ASCII", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
test_iconv ("ASCII//", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
test_iconv ("ASCII//TRANSLIT", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
test_iconv ("ASCII//TRANSLIT//", "UTF-8", ascii, strlen (ascii), 0, ascii,
0);
test_iconv ("ASCII//IGNORE", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
test_iconv ("ASCII//IGNORE//", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
/* 2. Valid UTF-8 input with non-ASCII characters: */
/* EILSEQ when converted to ASCII. */
test_iconv ("ASCII", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a, EILSEQ);
/* Converted without error with TRANSLIT enabled. */
test_iconv ("ASCII//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2, u2a_translit,
0);
/* EILSEQ with IGNORE enabled. Non-ASCII chars dropped from output. */
test_iconv ("ASCII//IGNORE", "UTF-8", utf8, strlen (utf8), (size_t) -1,
u2a_ignore, EILSEQ);
/* With TRANSLIT and IGNORE enabled, transliterated without error. We test
four combinations. */
test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", utf8, strlen (utf8), 2,
u2a_translit, 0);
test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", utf8, strlen (utf8), 2,
u2a_translit, 0);
test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
u2a_translit, 0);
/* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
u2a_translit, 0);
/* Misspellings of TRANSLIT and IGNORE are ignored, but conversion still
works while respecting any other correctly spelled options. */
test_iconv ("ASCII//T", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
EILSEQ);
test_iconv ("ASCII//TRANSLITERATE", "UTF-8", utf8, strlen (u2a), (size_t) -1,
u2a, EILSEQ);
test_iconv ("ASCII//I", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
EILSEQ);
test_iconv ("ASCII//IGNORED", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
EILSEQ);
test_iconv ("ASCII//TRANSLITERATE//IGNORED", "UTF-8", utf8, strlen (u2a),
(size_t) -1, u2a, EILSEQ);
test_iconv ("ASCII//IGNORED,TRANSLITERATE", "UTF-8", utf8, strlen (u2a),
(size_t) -1, u2a, EILSEQ);
test_iconv ("ASCII//T//I", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
EILSEQ);
test_iconv ("ASCII//TRANSLIT//I", "UTF-8", utf8, strlen (utf8), 2,
u2a_translit, 0);
/* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
test_iconv ("ASCII//I//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
u2a_translit, 0);
test_iconv ("ASCII//IGNORED,TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
u2a_translit, 0);
test_iconv ("ASCII//TRANSLIT,IGNORED", "UTF-8", utf8, strlen (utf8), 2,
u2a_translit, 0);
test_iconv ("ASCII//IGNORE,T", "UTF-8", utf8, strlen (utf8), (size_t) -1,
u2a_ignore, EILSEQ);
test_iconv ("ASCII//T,IGNORE", "UTF-8", utf8, strlen (utf8), (size_t) -1,
u2a_ignore, EILSEQ);
/* Due to bug 19519, iconv was ignoring IGNORE for the following input. */
test_iconv ("ASCII//TRANSLITERATE//IGNORE", "UTF-8", utf8, strlen (utf8),
(size_t) -1, u2a_ignore, EILSEQ);
test_iconv ("ASCII//IGNORE//TRANSLITERATE", "UTF-8", utf8, strlen (utf8),
(size_t) -1, u2a_ignore, EILSEQ);
/* 3. Invalid UTF-8 followed by some valid non-ASCII UTF-8 characters: */
/* EILSEQ; output is truncated at the first invalid UTF-8 character. */
test_iconv ("ASCII", "UTF-8", iutf8, strlen (iu2a), (size_t) -1, iu2a,
EILSEQ);
/* With TRANSLIT enabled: EILSEQ; output still truncated at the first invalid
UTF-8 character. */
test_iconv ("ASCII//TRANSLIT", "UTF-8", iutf8, strlen (iu2a), (size_t) -1,
iu2a, EILSEQ);
/* With IGNORE enabled: EILSEQ; output omits invalid UTF-8 characters and
valid UTF-8 non-ASCII characters. */
test_iconv ("ASCII//IGNORE", "UTF-8", iutf8, strlen (iutf8), (size_t) -1,
iu2a_ignore, EILSEQ);
/* With TRANSLIT and IGNORE enabled, output omits only invalid UTF-8
characters and transliterates valid non-ASCII UTF-8 characters. We test
four combinations. */
test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", iutf8, strlen (iutf8), 2,
iu2a_both, 0);
/* Due to bug 19519, iconv was ignoring IGNORE for the following input. */
test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", iutf8, strlen (iutf8), 2,
iu2a_both, 0);
test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", iutf8, strlen (iutf8), 2,
iu2a_both, 0);
/* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", iutf8, strlen (iutf8), 2,
iu2a_both, 0);
/* 4. Invalid UTF-8 with valid non-ASCII UTF-8 chars appearing first: */
/* EILSEQ; output is truncated at the first non-ASCII character. */
test_iconv ("ASCII", "UTF-8", jutf8, strlen (ju2a), (size_t) -1, ju2a,
EILSEQ);
/* With TRANSLIT enabled: EILSEQ; output now truncated at the first invalid
UTF-8 character. */
test_iconv ("ASCII//TRANSLIT", "UTF-8", jutf8, strlen (jutf8) - 5,
(size_t) -1, ju2a_translit, EILSEQ);
test_iconv ("ASCII//translit", "UTF-8", jutf8, strlen (jutf8) - 5,
(size_t) -1, ju2a_translit, EILSEQ);
/* With IGNORE enabled: EILSEQ; output omits invalid UTF-8 characters and
valid UTF-8 non-ASCII characters. */
test_iconv ("ASCII//IGNORE", "UTF-8", jutf8, strlen (jutf8), (size_t) -1,
ju2a_ignore, EILSEQ);
test_iconv ("ASCII//ignore", "UTF-8", jutf8, strlen (jutf8), (size_t) -1,
ju2a_ignore, EILSEQ);
/* With TRANSLIT and IGNORE enabled, output omits only invalid UTF-8
characters and transliterates valid non-ASCII UTF-8 characters. We test
several combinations. */
test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", jutf8, strlen (jutf8), 2,
ju2a_both, 0);
/* Due to bug 19519, iconv was ignoring IGNORE for the following input. */
test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", jutf8, strlen (jutf8), 2,
ju2a_both, 0);
test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", jutf8, strlen (jutf8), 2,
ju2a_both, 0);
/* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", jutf8, strlen (jutf8), 2,
ju2a_both, 0);
test_iconv ("ASCII//translit,ignore", "UTF-8", jutf8, strlen (jutf8), 2,
ju2a_both, 0);
/* Trailing whitespace and separators should be ignored. */
test_iconv ("ASCII//IGNORE,TRANSLIT ", "UTF-8", jutf8, strlen (jutf8), 2,
ju2a_both, 0);
test_iconv ("ASCII//IGNORE,TRANSLIT/", "UTF-8", jutf8, strlen (jutf8), 2,
ju2a_both, 0);
test_iconv ("ASCII//IGNORE,TRANSLIT//", "UTF-8", jutf8, strlen (jutf8), 2,
ju2a_both, 0);
test_iconv ("ASCII//IGNORE,TRANSLIT,", "UTF-8", jutf8, strlen (jutf8), 2,
ju2a_both, 0);
test_iconv ("ASCII//IGNORE,TRANSLIT,,", "UTF-8", jutf8, strlen (jutf8), 2,
ju2a_both, 0);
test_iconv ("ASCII//IGNORE,TRANSLIT /,", "UTF-8", jutf8, strlen (jutf8), 2,
ju2a_both, 0);
/* TRANSLIT or IGNORE suffixes in fromcode should be ignored. */
test_iconv ("ASCII", "UTF-8//TRANSLIT", jutf8, strlen (ju2a), (size_t) -1,
ju2a, EILSEQ);
test_iconv ("ASCII", "UTF-8//IGNORE", jutf8, strlen (ju2a), (size_t) -1,
ju2a, EILSEQ);
test_iconv ("ASCII", "UTF-8//TRANSLIT,IGNORE", jutf8, strlen (ju2a),
(size_t) -1, ju2a, EILSEQ);
/* 5. Charset names of the form "A/B/": */
/* ISO-8859-1 is converted to UTF-8 without needing transliteration. */
test_iconv ("ISO-10646/UTF-8", "ISO-8859-1", iso8859_1_a,
strlen (iso8859_1_a), 0, utf8_a, 0);
test_iconv ("ISO-10646/UTF-8/", "ISO-8859-1", iso8859_1_a,
strlen (iso8859_1_a), 0, utf8_a, 0);
test_iconv ("ISO-10646/UTF-8/IGNORE", "ISO-8859-1", iso8859_1_a,
strlen (iso8859_1_a), 0, utf8_a, 0);
test_iconv ("ISO-10646/UTF-8//IGNORE", "ISO-8859-1", iso8859_1_a,
strlen (iso8859_1_a), 0, utf8_a, 0);
test_iconv ("ISO-10646/UTF-8/TRANSLIT", "ISO-8859-1", iso8859_1_a,
strlen (iso8859_1_a), 0, utf8_a, 0);
test_iconv ("ISO-10646/UTF-8//TRANSLIT", "ISO-8859-1", iso8859_1_a,
strlen (iso8859_1_a), 0, utf8_a, 0);
test_iconv ("ISO-10646/UTF-8//TRANSLIT/IGNORE", "ISO-8859-1", iso8859_1_a,
strlen (iso8859_1_a), 0, utf8_a, 0);
test_iconv ("ISO-10646/UTF-8//TRANSLIT//IGNORE", "ISO-8859-1", iso8859_1_a,
strlen (iso8859_1_a), 0, utf8_a, 0);
test_iconv ("ISO-10646/UTF-8/TRANSLIT,IGNORE", "ISO-8859-1", iso8859_1_a,
strlen (iso8859_1_a), 0, utf8_a, 0);
/* UTF-8 with accented A's is converted to ASCII with transliteration. */
test_iconv ("ASCII", "ISO-10646/UTF-8", utf8_a,
0, (size_t) -1, empty, EILSEQ);
test_iconv ("ASCII//IGNORE", "ISO-10646/UTF-8", utf8_a,
strlen (utf8_a), (size_t) -1, empty, EILSEQ);
test_iconv ("ASCII//TRANSLIT", "ISO-10646/UTF-8", utf8_a,
strlen (utf8_a), 12, ascii_a, 0);
/* Invalid ASCII is converted to UTF-8 only with IGNORE. */
test_iconv ("ISO-10646/UTF-8", "ASCII", iascii, strlen (empty), (size_t) -1,
empty, EILSEQ);
test_iconv ("ISO-10646/UTF-8/TRANSLIT", "ASCII", iascii, strlen (empty),
(size_t) -1, empty, EILSEQ);
test_iconv ("ISO-10646/UTF-8/IGNORE", "ASCII", iascii, strlen (iascii),
(size_t) -1, ia2u_ignore, EILSEQ);
test_iconv ("ISO-10646/UTF-8/TRANSLIT,IGNORE", "ASCII", iascii,
strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
/* Due to bug 19519, iconv was ignoring IGNORE for the following three
inputs: */
test_iconv ("ISO-10646/UTF-8/TRANSLIT/IGNORE", "ASCII", iascii,
strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
test_iconv ("ISO-10646/UTF-8//TRANSLIT,IGNORE", "ASCII", iascii,
strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
test_iconv ("ISO-10646/UTF-8//TRANSLIT//IGNORE", "ASCII", iascii,
strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
return 0;
}
#include <support/test-driver.c>

280
iconv/tst-iconv_prog.sh Normal file
View File

@ -0,0 +1,280 @@
#!/bin/bash
# Test for some known iconv(1) hangs from bug 19519, and miscellaneous
# iconv(1) program error conditions.
# Copyright (C) 2020 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.
codir=$1
test_wrapper_env="$2"
run_program_env="$3"
# We have to have some directories in the library path.
LIBPATH=$codir:$codir/iconvdata
# How the start the iconv(1) program. $from is not defined/expanded yet.
ICONV='
$codir/elf/ld.so --library-path $LIBPATH --inhibit-rpath ${from}.so
$codir/iconv/iconv_prog
'
ICONV="$test_wrapper_env $run_program_env $ICONV"
# List of known hangs;
# Gathered by running an exhaustive 2 byte input search against glibc-2.28
hangarray=(
"\x00\x23;-c;ANSI_X3.110;UTF-8//TRANSLIT//IGNORE"
"\x00\xa1;-c;ARMSCII-8;UTF-8//TRANSLIT//IGNORE"
"\x00\xa1;-c;ASMO_449;UTF-8//TRANSLIT//IGNORE"
"\x00\x81;-c;BIG5;UTF-8//TRANSLIT//IGNORE"
"\x00\xff;-c;BIG5HKSCS;UTF-8//TRANSLIT//IGNORE"
"\x00\xff;-c;BRF;UTF-8//TRANSLIT//IGNORE"
"\x00\xff;-c;BS_4730;UTF-8//TRANSLIT//IGNORE"
"\x00\x81;-c;CP1250;UTF-8//TRANSLIT//IGNORE"
"\x00\x98;-c;CP1251;UTF-8//TRANSLIT//IGNORE"
"\x00\x81;-c;CP1252;UTF-8//TRANSLIT//IGNORE"
"\x00\x81;-c;CP1253;UTF-8//TRANSLIT//IGNORE"
"\x00\x81;-c;CP1254;UTF-8//TRANSLIT//IGNORE"
"\x00\x81;-c;CP1255;UTF-8//TRANSLIT//IGNORE"
"\x00\x81;-c;CP1257;UTF-8//TRANSLIT//IGNORE"
"\x00\x81;-c;CP1258;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;CP932;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;CSA_Z243.4-1985-1;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;CSA_Z243.4-1985-2;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;DEC-MCS;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;DIN_66003;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;DS_2089;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;EBCDIC-AT-DE;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;EBCDIC-AT-DE-A;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;EBCDIC-CA-FR;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;EBCDIC-DK-NO;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;EBCDIC-DK-NO-A;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;EBCDIC-ES;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;EBCDIC-ES-A;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;EBCDIC-ES-S;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;EBCDIC-FI-SE;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;EBCDIC-FI-SE-A;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;EBCDIC-FR;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;EBCDIC-IS-FRISS;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;EBCDIC-IT;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;EBCDIC-PT;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;EBCDIC-UK;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;EBCDIC-US;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;ES;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;ES2;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;EUC-CN;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;EUC-JISX0213;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;EUC-JP;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;EUC-JP-MS;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;EUC-KR;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;EUC-TW;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;GB18030;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;GB_1988-80;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;GBK;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;GOST_19768-74;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;GREEK7;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;GREEK7-OLD;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;GREEK-CCITT;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;HP-GREEK8;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;HP-ROMAN8;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;HP-ROMAN9;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;HP-THAI8;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;HP-TURKISH8;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;IBM038;UTF-8//TRANSLIT//IGNORE"
"\x00\x80;-c;IBM1004;UTF-8//TRANSLIT//IGNORE"
"\x00\xff;-c;IBM1008;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;IBM1046;UTF-8//TRANSLIT//IGNORE"
"\x00\x51;-c;IBM1132;UTF-8//TRANSLIT//IGNORE"
"\x00\xa0;-c;IBM1133;UTF-8//TRANSLIT//IGNORE"
"\x00\xce;-c;IBM1137;UTF-8//TRANSLIT//IGNORE"
"\x00\x80;-c;IBM1161;UTF-8//TRANSLIT//IGNORE"
"\x00\xdb;-c;IBM1162;UTF-8//TRANSLIT//IGNORE"
"\x00\x70;-c;IBM12712;UTF-8//TRANSLIT//IGNORE"
# These are known hangs that are yet to be fixed:
# "\x00\x0f;-c;IBM1364;UTF-8"
# "\x00\x0f;-c;IBM1371;UTF-8"
# "\x00\x0f;-c;IBM1388;UTF-8"
# "\x00\x0f;-c;IBM1390;UTF-8"
# "\x00\x0f;-c;IBM1399;UTF-8"
"\x00\x53;-c;IBM16804;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;IBM274;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;IBM275;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;IBM281;UTF-8//TRANSLIT//IGNORE"
"\x00\x57;-c;IBM290;UTF-8//TRANSLIT//IGNORE"
"\x00\x45;-c;IBM420;UTF-8//TRANSLIT//IGNORE"
"\x00\x68;-c;IBM423;UTF-8//TRANSLIT//IGNORE"
"\x00\x70;-c;IBM424;UTF-8//TRANSLIT//IGNORE"
"\x00\x53;-c;IBM4517;UTF-8//TRANSLIT//IGNORE"
"\x00\x53;-c;IBM4899;UTF-8//TRANSLIT//IGNORE"
"\x00\xa5;-c;IBM4909;UTF-8//TRANSLIT//IGNORE"
"\x00\xdc;-c;IBM4971;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;IBM803;UTF-8//TRANSLIT//IGNORE"
"\x00\x91;-c;IBM851;UTF-8//TRANSLIT//IGNORE"
"\x00\x9b;-c;IBM856;UTF-8//TRANSLIT//IGNORE"
"\x00\xd5;-c;IBM857;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;IBM864;UTF-8//TRANSLIT//IGNORE"
"\x00\x94;-c;IBM868;UTF-8//TRANSLIT//IGNORE"
"\x00\x94;-c;IBM869;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;IBM874;UTF-8//TRANSLIT//IGNORE"
"\x00\x6a;-c;IBM875;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;IBM880;UTF-8//TRANSLIT//IGNORE"
"\x00\x80;-c;IBM891;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;IBM903;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;IBM904;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;IBM905;UTF-8//TRANSLIT//IGNORE"
"\x00\x80;-c;IBM9066;UTF-8//TRANSLIT//IGNORE"
"\x00\x48;-c;IBM918;UTF-8//TRANSLIT//IGNORE"
"\x00\x57;-c;IBM930;UTF-8//TRANSLIT//IGNORE"
"\x00\x80;-c;IBM932;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;IBM933;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;IBM935;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;IBM937;UTF-8//TRANSLIT//IGNORE"
"\x00\x41;-c;IBM939;UTF-8//TRANSLIT//IGNORE"
"\x00\x80;-c;IBM943;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;INIS;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;INIS-8;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;INIS-CYRILLIC;UTF-8//TRANSLIT//IGNORE"
"\x00\xec;-c;ISIRI-3342;UTF-8//TRANSLIT//IGNORE"
"\x00\xec;-c;ISO_10367-BOX;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;ISO-2022-CN;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;ISO-2022-CN-EXT;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;ISO-2022-JP;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;ISO-2022-JP-2;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;ISO-2022-JP-3;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;ISO-2022-KR;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;ISO_2033;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;ISO_5427;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;ISO_5427-EXT;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;ISO_5428;UTF-8//TRANSLIT//IGNORE"
"\x00\xa4;-c;ISO_6937;UTF-8//TRANSLIT//IGNORE"
"\x00\xa0;-c;ISO_6937-2;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;ISO-8859-11;UTF-8//TRANSLIT//IGNORE"
"\x00\xa5;-c;ISO-8859-3;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;ISO-8859-6;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;ISO-8859-7;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;ISO-8859-8;UTF-8//TRANSLIT//IGNORE"
"\x00\x80;-c;ISO-IR-197;UTF-8//TRANSLIT//IGNORE"
"\x00\x80;-c;ISO-IR-209;UTF-8//TRANSLIT//IGNORE"
"\x00\x80;-c;IT;UTF-8//TRANSLIT//IGNORE"
"\x00\x80;-c;JIS_C6220-1969-RO;UTF-8//TRANSLIT//IGNORE"
"\x00\x80;-c;JIS_C6229-1984-B;UTF-8//TRANSLIT//IGNORE"
"\x00\x80;-c;JOHAB;UTF-8//TRANSLIT//IGNORE"
"\x00\x80;-c;JUS_I.B1.002;UTF-8//TRANSLIT//IGNORE"
"\x00\x80;-c;KOI-8;UTF-8//TRANSLIT//IGNORE"
"\x00\x88;-c;KOI8-T;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;KSC5636;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;LATIN-GREEK;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;LATIN-GREEK-1;UTF-8//TRANSLIT//IGNORE"
"\x00\xf6;-c;MAC-IS;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;MSZ_7795.3;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;NATS-DANO;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;NATS-SEFI;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;NC_NC00-10;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;NF_Z_62-010;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;NF_Z_62-010_1973;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;NS_4551-1;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;NS_4551-2;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;PT;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;PT2;UTF-8//TRANSLIT//IGNORE"
"\x00\x98;-c;RK1048;UTF-8//TRANSLIT//IGNORE"
"\x00\x98;-c;SEN_850200_B;UTF-8//TRANSLIT//IGNORE"
"\x00\x98;-c;SEN_850200_C;UTF-8//TRANSLIT//IGNORE"
"\x00\x80;-c;Shift_JISX0213;UTF-8//TRANSLIT//IGNORE"
"\x00\x80;-c;SJIS;UTF-8//TRANSLIT//IGNORE"
"\x00\x23;-c;T.61-8BIT;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;TIS-620;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;TSCII;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;UHC;UTF-8//TRANSLIT//IGNORE"
"\x00\xd8;-c;UNICODE;UTF-8//TRANSLIT//IGNORE"
"\x00\xdc;-c;UTF-16;UTF-8//TRANSLIT//IGNORE"
"\xdc\x00;-c;UTF-16BE;UTF-8//TRANSLIT//IGNORE"
"\x00\xdc;-c;UTF-16LE;UTF-8//TRANSLIT//IGNORE"
"\xff\xff;-c;UTF-7;UTF-8//TRANSLIT//IGNORE"
"\x00\x81;-c;WIN-SAMI-2;UTF-8//TRANSLIT//IGNORE"
)
# List of option combinations that *should* lead to an error
errorarray=(
# Converting from/to invalid character sets should cause error
"\x00\x00;;INVALID;INVALID"
"\x00\x00;;INVALID;UTF-8"
"\x00\x00;;UTF-8;INVALID"
)
# Requires $twobyte input, $c flag, $from, and $to to be set; sets $ret
execute_test ()
{
eval PROG=\"$ICONV\"
echo -en "$twobyte" \
| timeout -k 4 3 $PROG $c -f $from -t "$to" &>/dev/null
ret=$?
}
check_hangtest_result ()
{
if [ "$ret" -eq "124" ] || [ "$ret" -eq "137" ]; then # timeout/hang
result="HANG"
else
if [ "$ret" -eq "139" ]; then # segfault
result="SEGFAULT"
else
if [ "$ret" -gt "127" ]; then # unexpected error
result="UNEXPECTED"
else
result="OK"
fi
fi
fi
echo -n "$result: from: \"$from\", to: \"$to\","
echo " input \"$twobyte\", flags \"$c\""
if [ "$result" != "OK" ]; then
exit 1
fi
}
for hangcommand in "${hangarray[@]}"; do
twobyte="$(echo "$hangcommand" | cut -d";" -f 1)"
c="$(echo "$hangcommand" | cut -d";" -f 2)"
from="$(echo "$hangcommand" | cut -d";" -f 3)"
to="$(echo "$hangcommand" | cut -d";" -f 4)"
execute_test
check_hangtest_result
done
check_errtest_result ()
{
if [ "$ret" -eq "1" ]; then # we errored out as expected
result="PASS"
else
result="FAIL"
fi
echo -n "$result: from: \"$from\", to: \"$to\","
echo " input \"$twobyte\", flags \"$c\", return code $ret"
if [ "$result" != "PASS" ]; then
exit 1
fi
}
for errorcommand in "${errorarray[@]}"; do
twobyte="$(echo "$errorcommand" | cut -d";" -f 1)"
c="$(echo "$errorcommand" | cut -d";" -f 2)"
from="$(echo "$errorcommand" | cut -d";" -f 3)"
to="$(echo "$errorcommand" | cut -d";" -f 4)"
execute_test
check_errtest_result
done

View File

@ -1119,11 +1119,16 @@ _nl_find_msg (struct loaded_l10nfile *domain_file,
outcharset = encoding;
# ifdef _LIBC
/* We always want to use transliteration. */
outcharset = norm_add_slashes (outcharset, "TRANSLIT");
charset = norm_add_slashes (charset, "");
int r = __gconv_open (outcharset, charset, &convd->conv,
GCONV_AVOID_NOCONV);
struct gconv_spec conv_spec
= { .fromcode = norm_add_slashes (charset, ""),
.tocode = norm_add_slashes (outcharset, ""),
/* We always want to use transliteration. */
.translit = true,
.ignore = false
};
int r = __gconv_open (&conv_spec, &convd->conv,
GCONV_AVOID_NOCONV);
if (__builtin_expect (r != __GCONV_OK, 0))
{
/* If the output encoding is the same there is