iconv: Input buffering for the iconv program (bug 6050)

Do not read the entire input file into memory.

Reviewed-by: DJ Delorie <dj@redhat.com>
This commit is contained in:
Florian Weimer 2024-09-20 13:10:54 +02:00
parent 75819cdd29
commit fa1b0d5e9f
2 changed files with 108 additions and 105 deletions

View File

@ -118,8 +118,9 @@ static size_t output_buffer_size = 1024 * 1024;
/* Prototypes for the functions doing the actual work. */
static void prepare_output_file (char **argv);
static void close_output_file (int status);
static int process_block (iconv_t cd, char *addr, size_t len);
static void close_output_file (__gconv_t cd, int status);
static int process_block (iconv_t cd, char **addr, size_t *len,
off64_t file_offset, bool *incomplete);
static int process_fd (iconv_t cd, int fd);
static int process_file (iconv_t cd, FILE *input);
static void print_known_names (void);
@ -311,7 +312,7 @@ conversions from `%s' and to `%s' are not supported"),
status = EXIT_FAILURE;
/* Close the output file now. */
close_output_file (status);
close_output_file (cd, status);
}
return status;
@ -599,7 +600,7 @@ flush_output (void)
}
static void
close_output_file (int status)
close_output_file (__gconv_t cd, int status)
{
/* Do not perform a flush if a temporary file or the in-memory
buffer is in use and there was an error. It would clobber the
@ -608,10 +609,28 @@ close_output_file (int status)
(output_using_temporary_file || output_fd < 0))
return;
/* The current_input_file_index variable is now larger than
last_overlapping_file_index, so the flush_output call switches
/* All the input text is processed. For state-dependent character
sets we have to flush the state now.
The current_input_file_index variable is now larger than
last_overlapping_file_index, so the flush_output calls switch
away from the temporary file. */
size_t n = iconv (cd, NULL, NULL,
&output_buffer_current, &output_buffer_remaining);
if (n == (size_t) -1 && errno == E2BIG)
{
/* Try again if the state flush exceeded the buffer space. */
flush_output ();
n = iconv (cd, NULL, NULL,
&output_buffer_current, &output_buffer_remaining);
}
int saved_errno = errno;
flush_output ();
if (n == (size_t) -1 && !omit_invalid)
{
errno = saved_errno;
output_error ();
}
if (output_fd == STDOUT_FILENO)
{
@ -625,52 +644,36 @@ close_output_file (int status)
output_error ();
}
/* CD is the iconv handle. Input processing starts at *ADDR, and
consumes upto *LEN bytes. *ADDR and *LEN are updated. FILE_OFFSET
is the file offset of the data initially at ADDR. *INCOMPLETE is
set to true if conversion stops due to an incomplete input
sequence. */
static int
process_block (iconv_t cd, char *addr, size_t len)
process_block (iconv_t cd, char **addr, size_t *len, off64_t file_offset,
bool *incomplete)
{
const char *start = addr;
const char *start = *addr;
size_t n;
int ret = 0;
while (len > 0)
while (*len > 0)
{
n = iconv (cd, &addr, &len,
n = iconv (cd, addr, len,
&output_buffer_current, &output_buffer_remaining);
if (n == (size_t) -1 && omit_invalid && errno == EILSEQ)
{
ret = 1;
if (len == 0)
if (*len == 0)
n = 0;
else
errno = E2BIG;
}
if (n != (size_t) -1)
{
/* All the input test is processed. For state-dependent
character sets we have to flush the state now. */
n = iconv (cd, NULL, NULL,
&output_buffer_current, &output_buffer_remaining);
if (n == (size_t) -1 && errno == E2BIG)
{
/* Try again if the state flush exceeded the buffer space. */
flush_output ();
n = iconv (cd, NULL, NULL,
&output_buffer_current, &output_buffer_remaining);
}
bool errno_is_EILSEQ = errno == EILSEQ;
if (n != (size_t) -1)
break;
if (omit_invalid && errno_is_EILSEQ)
{
ret = 1;
break;
}
}
if (errno == E2BIG)
flush_output ();
else
@ -680,13 +683,12 @@ process_block (iconv_t cd, char *addr, size_t len)
{
case EILSEQ:
if (! omit_invalid)
error (0, 0, _("illegal input sequence at position %ld"),
(long int) (addr - start));
error (0, 0, _("illegal input sequence at position %lld"),
(long long int) (file_offset + (*addr - start)));
break;
case EINVAL:
error (0, 0, _("\
incomplete character or shift sequence at end of buffer"));
break;
*incomplete = true;
return ret;
case EBADF:
error (0, 0, _("internal error (illegal descriptor)"));
break;
@ -706,79 +708,49 @@ incomplete character or shift sequence at end of buffer"));
static int
process_fd (iconv_t cd, int fd)
{
/* we have a problem with reading from a descriptor since we must not
provide the iconv() function an incomplete character or shift
sequence at the end of the buffer. Since we have to deal with
arbitrary encodings we must read the whole text in a buffer and
process it in one step. */
static char *inbuf = NULL;
static size_t maxlen = 0;
char *inptr = inbuf;
size_t actlen = 0;
char inbuf[BUFSIZ];
char *inbuf_end = inbuf + sizeof (inbuf);
size_t inbuf_used = 0;
off64_t file_offset = 0;
int status = 0;
bool incomplete = false;
while (actlen < maxlen)
while (true)
{
ssize_t n = read (fd, inptr, maxlen - actlen);
if (n == 0)
/* No more text to read. */
break;
if (n == -1)
char *p = inbuf + inbuf_used;
ssize_t read_ret = read (fd, p, inbuf_end - p);
if (read_ret == 0)
{
/* On EOF, check if the previous iconv invocation saw an
incomplete sequence. */
if (incomplete)
{
error (0, 0, _("\
incomplete character or shift sequence at end of buffer"));
return 1;
}
return 0;
}
if (read_ret < 0)
{
/* Error while reading. */
error (0, errno, _("error while reading the input"));
return -1;
}
inptr += n;
actlen += n;
}
if (actlen == maxlen)
while (1)
inbuf_used += read_ret;
incomplete = false;
p = inbuf;
int ret = process_block (cd, &p, &inbuf_used, file_offset, &incomplete);
if (ret != 0)
{
ssize_t n;
char *new_inbuf;
/* Increase the buffer. */
new_inbuf = (char *) realloc (inbuf, maxlen + 32768);
if (new_inbuf == NULL)
{
error (0, errno, _("unable to allocate buffer for input"));
return -1;
}
inbuf = new_inbuf;
maxlen += 32768;
inptr = inbuf + actlen;
do
{
n = read (fd, inptr, maxlen - actlen);
if (n == 0)
/* No more text to read. */
break;
if (n == -1)
{
/* Error while reading. */
error (0, errno, _("error while reading the input"));
return -1;
}
inptr += n;
actlen += n;
}
while (actlen < maxlen);
if (n == 0)
/* Break again so we leave both loops. */
status = ret;
if (ret < 0)
break;
}
/* Now we have all the input in the buffer. Process it in one run. */
return process_block (cd, inbuf, actlen);
/* The next loop iteration consumes the leftover bytes. */
memmove (inbuf, p, inbuf_used);
file_offset += read_ret - inbuf_used;
}
return status;
}

View File

@ -50,6 +50,9 @@ echo OUT > "$tmp/out-template"
: > "$tmp/empty"
printf '\xff' > "$tmp/0xff"
# Length should be a prime number, to help with buffer alignment testing.
printf '\xc3\xa4\xe2\x80\x94\xe2\x80\x94\xc3\xa4\n' > "$tmp/utf8-sequence"
# Double all files to produce larger buffers.
for p in "$tmp"/* ; do
i=0
@ -270,6 +273,34 @@ expect_exit 1 run_iconv -o "$tmp/out" "$tmp/abc" - < "$tmp/0xff" "$tmp/def"
run_iconv -o "$tmp/out" "$tmp/xy" - - "$tmp/zt" < "$tmp/abc"
expect_files xy abc zt
# NB: Extra iconv args are ignored after this point. Actual
# multi-byte conversion does not work with tiny buffers.
iconv_args="-f UTF-8 -t ASCII"
printf 'x\n\xc3' > "$tmp/incomplete"
expect_exit 1 run_iconv -o "$tmp/out" "$tmp/incomplete"
check_out <<EOF
x
EOF
# Test buffering behavior if the buffer ends with an incomplete
# multi-byte sequence.
prefix=""
prefix_length=0
while test $prefix_length -lt 12; do
echo "info: testing prefix length $prefix_length" 2>&$logfd
printf "%s" "$prefix" > "$tmp/prefix"
cat "$tmp/prefix" "$tmp/utf8-sequence" > "$tmp/tmp"
iconv_args="-f UTF-8 -t UCS-4"
run_iconv -o "$tmp/out1" "$tmp/tmp"
iconv_args="-f UCS-4 -t UTF-8"
run_iconv -o "$tmp/out" "$tmp/out1"
expect_files prefix utf8-sequence
prefix="$prefix@"
prefix_length=$(($prefix_length + 1))
done
if $failure ; then
exit 1
fi