iconv: Input buffering for the iconv program (bug 6050)

Do not read the entire input file into memory.

Reviewed-by: DJ Delorie <dj@redhat.com>
2024-11-21 12:30:06 +00:00 · 2024-09-20 13:10:54 +02:00 · 2024-09-20 13:10:54 +02:00 · fa1b0d5e9f
commit fa1b0d5e9f
parent 75819cdd29
2 changed files with 108 additions and 105 deletions
--- a/iconv/iconv_prog.c
+++ b/iconv/iconv_prog.c
@@ -118,8 +118,9 @@ static size_t output_buffer_size = 1024 * 1024;
 /* Prototypes for the functions doing the actual work.  */
 static void prepare_output_file (char **argv);
-static void close_output_file (int status);
+static void close_output_file (__gconv_t cd, int status);
-static int process_block (iconv_t cd, char *addr, size_t len);
+static int process_block (iconv_t cd, char **addr, size_t *len,
 			  off64_t file_offset, bool *incomplete);
 static int process_fd (iconv_t cd, int fd);
 static int process_file (iconv_t cd, FILE *input);
 static void print_known_names (void);
@@ -311,7 +312,7 @@ conversions from `%s' and to `%s' are not supported"),
 	status = EXIT_FAILURE;
      /* Close the output file now.  */
-      close_output_file (status);
+      close_output_file (cd, status);
    }
  return status;
@@ -599,7 +600,7 @@ flush_output (void)
 }
 static void
-close_output_file (int status)
+close_output_file (__gconv_t cd, int status)
 {
  /* Do not perform a flush if a temporary file or the in-memory
     buffer is in use and there was an error.  It would clobber the
@@ -608,10 +609,28 @@ close_output_file (int status)
      (output_using_temporary_file || output_fd < 0))
    return;
-  /* The current_input_file_index variable is now larger than
+  /* All the input text is processed.  For state-dependent character
-     last_overlapping_file_index, so the flush_output call switches
+     sets we have to flush the state now.
     The current_input_file_index variable is now larger than
     last_overlapping_file_index, so the flush_output calls switch
     away from the temporary file.  */
  size_t n = iconv (cd, NULL, NULL,
 		    &output_buffer_current, &output_buffer_remaining);
  if (n == (size_t) -1 && errno == E2BIG)
    {
      /* Try again if the state flush exceeded the buffer space.  */
      flush_output ();
      n = iconv (cd, NULL, NULL,
 		 &output_buffer_current, &output_buffer_remaining);
    }
  int saved_errno = errno;
  flush_output ();
  if (n == (size_t) -1 && !omit_invalid)
    {
      errno = saved_errno;
      output_error ();
    }
  if (output_fd == STDOUT_FILENO)
    {
@@ -625,51 +644,35 @@ close_output_file (int status)
    output_error ();
 }
 /* CD is the iconv handle.  Input processing starts at *ADDR, and
   consumes up to *LEN bytes.  *ADDR and *LEN are updated.  FILE_OFFSET
   is the file offset of the data initially at ADDR.  *INCOMPLETE is
   set to true if conversion stops due to an incomplete input
   sequence.  */
 static int
-process_block (iconv_t cd, char *addr, size_t len)
+process_block (iconv_t cd, char **addr, size_t *len, off64_t file_offset,
 	       bool *incomplete)
 {
-  const char *start = addr;
+  const char *start = *addr;
  size_t n;
  int ret = 0;
-  while (len > 0)
+  while (*len > 0)
    {
-      n = iconv (cd, &addr, &len,
+      n = iconv (cd, addr, len,
 		 &output_buffer_current, &output_buffer_remaining);
      if (n == (size_t) -1 && omit_invalid && errno == EILSEQ)
 	{
 	  ret = 1;
-	  if (len == 0)
+	  if (*len == 0)
 	    n = 0;
 	  else
 	    errno = E2BIG;
 	}
      if (n != (size_t) -1)
-	{
+	break;
 	  /* All the input text is processed.  For state-dependent
 	     character sets we have to flush the state now.  */
 	  n = iconv (cd, NULL, NULL,
 		     &output_buffer_current, &output_buffer_remaining);
 	  if (n == (size_t) -1 && errno == E2BIG)
 	    {
 	      /* Try again if the state flush exceeded the buffer space.  */
 	      flush_output ();
 	      n = iconv (cd, NULL, NULL,
 			 &output_buffer_current, &output_buffer_remaining);
 	    }
 	  bool errno_is_EILSEQ = errno == EILSEQ;
 	  if (n != (size_t) -1)
 	    break;
 	  if (omit_invalid && errno_is_EILSEQ)
 	    {
 	      ret = 1;
 	      break;
 	    }
 	}
      if (errno == E2BIG)
 	flush_output ();
@@ -680,13 +683,12 @@ process_block (iconv_t cd, char *addr, size_t len)
 	    {
 	    case EILSEQ:
 	      if (! omit_invalid)
-		error (0, 0, _("illegal input sequence at position %ld"),
+		error (0, 0, _("illegal input sequence at position %lld"),
-		       (long int) (addr - start));
+		       (long long int) (file_offset + (*addr - start)));
 	      break;
 	    case EINVAL:
-	      error (0, 0, _("\
+	      *incomplete = true;
-incomplete character or shift sequence at end of buffer"));
+	      return ret;
 	      break;
 	    case EBADF:
 	      error (0, 0, _("internal error (illegal descriptor)"));
 	      break;
@@ -706,79 +708,49 @@ incomplete character or shift sequence at end of buffer"));
 static int
 process_fd (iconv_t cd, int fd)
 {
-  /* we have a problem with reading from a descriptor since we must not
+  char inbuf[BUFSIZ];
-     provide the iconv() function an incomplete character or shift
+  char *inbuf_end = inbuf + sizeof (inbuf);
-     sequence at the end of the buffer.  Since we have to deal with
+  size_t inbuf_used = 0;
-     arbitrary encodings we must read the whole text in a buffer and
+  off64_t file_offset = 0;
-     process it in one step.  */
+  int status = 0;
-  static char *inbuf = NULL;
+  bool incomplete = false;
  static size_t maxlen = 0;
  char *inptr = inbuf;
  size_t actlen = 0;
-  while (actlen < maxlen)
+  while (true)
    {
-      ssize_t n = read (fd, inptr, maxlen - actlen);
+      char *p = inbuf + inbuf_used;
-
+      ssize_t read_ret = read (fd, p, inbuf_end - p);
-      if (n == 0)
+      if (read_ret == 0)
-	/* No more text to read.  */
+	{
-	break;
+	  /* On EOF, check if the previous iconv invocation saw an
-
+	     incomplete sequence.  */
-      if (n == -1)
+	  if (incomplete)
 	    {
 	      error (0, 0, _("\
 incomplete character or shift sequence at end of buffer"));
 	      return 1;
 	    }
 	  return 0;
 	}
      if (read_ret < 0)
 	{
 	  /* Error while reading.  */
 	  error (0, errno, _("error while reading the input"));
 	  return -1;
 	}
-
+      inbuf_used += read_ret;
-      inptr += n;
+      incomplete = false;
-      actlen += n;
+      p = inbuf;
      int ret = process_block (cd, &p, &inbuf_used, file_offset, &incomplete);
      if (ret != 0)
 	{
 	  status = ret;
 	  if (ret < 0)
 	    break;
 	}
      /* The next loop iteration consumes the leftover bytes.  */
      memmove (inbuf, p, inbuf_used);
      file_offset += read_ret - inbuf_used;
    }
-
+  return status;
  if (actlen == maxlen)
    while (1)
      {
 	ssize_t n;
 	char *new_inbuf;
 	/* Increase the buffer.  */
 	new_inbuf = (char *) realloc (inbuf, maxlen + 32768);
 	if (new_inbuf == NULL)
 	  {
 	    error (0, errno, _("unable to allocate buffer for input"));
 	    return -1;
 	  }
 	inbuf = new_inbuf;
 	maxlen += 32768;
 	inptr = inbuf + actlen;
 	do
 	  {
 	    n = read (fd, inptr, maxlen - actlen);
 	    if (n == 0)
 	      /* No more text to read.  */
 	      break;
 	    if (n == -1)
 	      {
 		/* Error while reading.  */
 		error (0, errno, _("error while reading the input"));
 		return -1;
 	      }
 	    inptr += n;
 	    actlen += n;
 	  }
 	while (actlen < maxlen);
 	if (n == 0)
 	  /* Break again so we leave both loops.  */
 	  break;
      }
  /* Now we have all the input in the buffer.  Process it in one run.  */
  return process_block (cd, inbuf, actlen);
 }
--- a/iconv/tst-iconv_prog-buffer.sh
+++ b/iconv/tst-iconv_prog-buffer.sh
@@ -50,6 +50,9 @@ echo OUT > "$tmp/out-template"
 : > "$tmp/empty"
 printf '\xff' > "$tmp/0xff"
 # Length should be a prime number, to help with buffer alignment testing.
 printf '\xc3\xa4\xe2\x80\x94\xe2\x80\x94\xc3\xa4\n' > "$tmp/utf8-sequence"
 # Double all files to produce larger buffers.
 for p in "$tmp"/* ; do
    i=0
@@ -270,6 +273,34 @@ expect_exit 1 run_iconv -o "$tmp/out" "$tmp/abc" - < "$tmp/0xff" "$tmp/def"
 run_iconv -o "$tmp/out" "$tmp/xy" - - "$tmp/zt" < "$tmp/abc"
 expect_files xy abc zt
 # NB: Extra iconv args are ignored after this point.  Actual
 # multi-byte conversion does not work with tiny buffers.
 iconv_args="-f UTF-8 -t ASCII"
 printf 'x\n\xc3' > "$tmp/incomplete"
 expect_exit 1 run_iconv -o "$tmp/out" "$tmp/incomplete"
 check_out <<EOF
 x
 EOF
 # Test buffering behavior if the buffer ends with an incomplete
 # multi-byte sequence.
 prefix=""
 prefix_length=0
 while test $prefix_length -lt 12; do
    echo "info: testing prefix length $prefix_length" 2>&$logfd
    printf "%s" "$prefix" > "$tmp/prefix"
    cat "$tmp/prefix" "$tmp/utf8-sequence" > "$tmp/tmp"
    iconv_args="-f UTF-8 -t UCS-4"
    run_iconv -o "$tmp/out1" "$tmp/tmp"
    iconv_args="-f UCS-4 -t UTF-8"
    run_iconv -o "$tmp/out" "$tmp/out1"
    expect_files prefix utf8-sequence
    prefix="$prefix@"
    prefix_length=$(($prefix_length + 1))
 done
 if $failure ; then
    exit 1
 fi