iconv: Input buffering for the iconv program (bug 6050)

Do not read the entire input file into memory. Reviewed-by: DJ Delorie <dj@redhat.com>
2024-11-21 12:30:06 +00:00 · 2024-09-20 13:10:54 +02:00 · 2024-09-20 13:10:54 +02:00 · fa1b0d5e9f
commit fa1b0d5e9f
parent 75819cdd29
2 changed files with 108 additions and 105 deletions
--- a/iconv/iconv_prog.c
+++ b/iconv/iconv_prog.c
@ -118,8 +118,9 @@ static size_t output_buffer_size = 1024 * 1024;

 /* Prototypes for the functions doing the actual work.  */
 static void prepare_output_file (char **argv);
-static void close_output_file (int status);
-static int process_block (iconv_t cd, char *addr, size_t len);
+static void close_output_file (__gconv_t cd, int status);
+static int process_block (iconv_t cd, char **addr, size_t *len,
+			  off64_t file_offset, bool *incomplete);
 static int process_fd (iconv_t cd, int fd);
 static int process_file (iconv_t cd, FILE *input);
 static void print_known_names (void);
@ -311,7 +312,7 @@ conversions from `%s' and to `%s' are not supported"),
 	status = EXIT_FAILURE;

      /* Close the output file now.  */
-      close_output_file (status);
+      close_output_file (cd, status);
    }

  return status;
@ -599,7 +600,7 @@ flush_output (void)
 }

 static void
-close_output_file (int status)
+close_output_file (__gconv_t cd, int status)
 {
  /* Do not perform a flush if a temporary file or the in-memory
     buffer is in use and there was an error.  It would clobber the
@ -608,10 +609,28 @@ close_output_file (int status)
      (output_using_temporary_file || output_fd < 0))
    return;

-  /* The current_input_file_index variable is now larger than
-     last_overlapping_file_index, so the flush_output call switches
+  /* All the input text is processed.  For state-dependent character
+     sets we have to flush the state now.
+
+     The current_input_file_index variable is now larger than
+     last_overlapping_file_index, so the flush_output calls switch
     away from the temporary file.  */
+  size_t n = iconv (cd, NULL, NULL,
+		    &output_buffer_current, &output_buffer_remaining);
+  if (n == (size_t) -1 && errno == E2BIG)
+    {
+      /* Try again if the state flush exceeded the buffer space.  */
      flush_output ();
+      n = iconv (cd, NULL, NULL,
+		 &output_buffer_current, &output_buffer_remaining);
+    }
+  int saved_errno = errno;
+  flush_output ();
+  if (n == (size_t) -1 && !omit_invalid)
+    {
+      errno = saved_errno;
+      output_error ();
+    }

  if (output_fd == STDOUT_FILENO)
    {
@ -625,52 +644,36 @@ close_output_file (int status)
    output_error ();
 }

+/* CD is the iconv handle.  Input processing starts at *ADDR, and
+   consumes upto *LEN bytes.  *ADDR and *LEN are updated.  FILE_OFFSET
+   is the file offset of the data initially at ADDR.  *INCOMPLETE is
+   set to true if conversion stops due to an incomplete input
+   sequence.  */
 static int
-process_block (iconv_t cd, char *addr, size_t len)
+process_block (iconv_t cd, char **addr, size_t *len, off64_t file_offset,
+	       bool *incomplete)
 {
-  const char *start = addr;
+  const char *start = *addr;
  size_t n;
  int ret = 0;

-  while (len > 0)
+  while (*len > 0)
    {
-      n = iconv (cd, &addr, &len,
+      n = iconv (cd, addr, len,
 		 &output_buffer_current, &output_buffer_remaining);

      if (n == (size_t) -1 && omit_invalid && errno == EILSEQ)
 	{
 	  ret = 1;
-	  if (len == 0)
+	  if (*len == 0)
 	    n = 0;
 	  else
 	    errno = E2BIG;
 	}

-      if (n != (size_t) -1)
-	{
-	  /* All the input test is processed.  For state-dependent
-	     character sets we have to flush the state now.  */
-	  n = iconv (cd, NULL, NULL,
-		     &output_buffer_current, &output_buffer_remaining);
-	  if (n == (size_t) -1 && errno == E2BIG)
-	    {
-	      /* Try again if the state flush exceeded the buffer space.  */
-	      flush_output ();
-	      n = iconv (cd, NULL, NULL,
-			 &output_buffer_current, &output_buffer_remaining);
-	    }
-	  bool errno_is_EILSEQ = errno == EILSEQ;
-
      if (n != (size_t) -1)
 	break;

-	  if (omit_invalid && errno_is_EILSEQ)
-	    {
-	      ret = 1;
-	      break;
-	    }
-	}
-
      if (errno == E2BIG)
 	flush_output ();
      else
@ -680,13 +683,12 @@ process_block (iconv_t cd, char *addr, size_t len)
 	    {
 	    case EILSEQ:
 	      if (! omit_invalid)
-		error (0, 0, _("illegal input sequence at position %ld"),
-		       (long int) (addr - start));
+		error (0, 0, _("illegal input sequence at position %lld"),
+		       (long long int) (file_offset + (*addr - start)));
 	      break;
 	    case EINVAL:
-	      error (0, 0, _("\
-incomplete character or shift sequence at end of buffer"));
-	      break;
+	      *incomplete = true;
+	      return ret;
 	    case EBADF:
 	      error (0, 0, _("internal error (illegal descriptor)"));
 	      break;
@ -706,79 +708,49 @@ incomplete character or shift sequence at end of buffer"));
 static int
 process_fd (iconv_t cd, int fd)
 {
-  /* we have a problem with reading from a descriptor since we must not
-     provide the iconv() function an incomplete character or shift
-     sequence at the end of the buffer.  Since we have to deal with
-     arbitrary encodings we must read the whole text in a buffer and
-     process it in one step.  */
-  static char *inbuf = NULL;
-  static size_t maxlen = 0;
-  char *inptr = inbuf;
-  size_t actlen = 0;
+  char inbuf[BUFSIZ];
+  char *inbuf_end = inbuf + sizeof (inbuf);
+  size_t inbuf_used = 0;
+  off64_t file_offset = 0;
+  int status = 0;
+  bool incomplete = false;

-  while (actlen < maxlen)
+  while (true)
    {
-      ssize_t n = read (fd, inptr, maxlen - actlen);
-
-      if (n == 0)
-	/* No more text to read.  */
-	break;
-
-      if (n == -1)
+      char *p = inbuf + inbuf_used;
+      ssize_t read_ret = read (fd, p, inbuf_end - p);
+      if (read_ret == 0)
+	{
+	  /* On EOF, check if the previous iconv invocation saw an
+	     incomplete sequence.  */
+	  if (incomplete)
+	    {
+	      error (0, 0, _("\
+incomplete character or shift sequence at end of buffer"));
+	      return 1;
+	    }
+	  return 0;
+	}
+      if (read_ret < 0)
 	{
-	  /* Error while reading.  */
 	  error (0, errno, _("error while reading the input"));
 	  return -1;
 	}
-
-      inptr += n;
-      actlen += n;
-    }
-
-  if (actlen == maxlen)
-    while (1)
+      inbuf_used += read_ret;
+      incomplete = false;
+      p = inbuf;
+      int ret = process_block (cd, &p, &inbuf_used, file_offset, &incomplete);
+      if (ret != 0)
 	{
-	ssize_t n;
-	char *new_inbuf;
-
-	/* Increase the buffer.  */
-	new_inbuf = (char *) realloc (inbuf, maxlen + 32768);
-	if (new_inbuf == NULL)
-	  {
-	    error (0, errno, _("unable to allocate buffer for input"));
-	    return -1;
-	  }
-	inbuf = new_inbuf;
-	maxlen += 32768;
-	inptr = inbuf + actlen;
-
-	do
-	  {
-	    n = read (fd, inptr, maxlen - actlen);
-
-	    if (n == 0)
-	      /* No more text to read.  */
-	      break;
-
-	    if (n == -1)
-	      {
-		/* Error while reading.  */
-		error (0, errno, _("error while reading the input"));
-		return -1;
-	      }
-
-	    inptr += n;
-	    actlen += n;
-	  }
-	while (actlen < maxlen);
-
-	if (n == 0)
-	  /* Break again so we leave both loops.  */
+	  status = ret;
+	  if (ret < 0)
 	    break;
 	}
-
-  /* Now we have all the input in the buffer.  Process it in one run.  */
-  return process_block (cd, inbuf, actlen);
+      /* The next loop iteration consumes the leftover bytes.  */
+      memmove (inbuf, p, inbuf_used);
+      file_offset += read_ret - inbuf_used;
+    }
+  return status;
 }


--- a/iconv/tst-iconv_prog-buffer.sh
+++ b/iconv/tst-iconv_prog-buffer.sh
@ -50,6 +50,9 @@ echo OUT > "$tmp/out-template"
 : > "$tmp/empty"
 printf '\xff' > "$tmp/0xff"

+# Length should be a prime number, to help with buffer alignment testing.
+printf '\xc3\xa4\xe2\x80\x94\xe2\x80\x94\xc3\xa4\n' > "$tmp/utf8-sequence"
+
 # Double all files to produce larger buffers.
 for p in "$tmp"/* ; do
    i=0
@ -270,6 +273,34 @@ expect_exit 1 run_iconv -o "$tmp/out" "$tmp/abc" - < "$tmp/0xff" "$tmp/def"
 run_iconv -o "$tmp/out" "$tmp/xy" - - "$tmp/zt" < "$tmp/abc"
 expect_files xy abc zt

+# NB: Extra iconv args are ignored after this point.  Actual
+# multi-byte conversion does not work with tiny buffers.
+iconv_args="-f UTF-8 -t ASCII"
+
+printf 'x\n\xc3' > "$tmp/incomplete"
+expect_exit 1 run_iconv -o "$tmp/out" "$tmp/incomplete"
+check_out <<EOF
+x
+EOF
+
+# Test buffering behavior if the buffer ends with an incomplete
+# multi-byte sequence.
+prefix=""
+prefix_length=0
+while test $prefix_length -lt 12; do
+    echo "info: testing prefix length $prefix_length" 2>&$logfd
+    printf "%s" "$prefix" > "$tmp/prefix"
+    cat "$tmp/prefix" "$tmp/utf8-sequence" > "$tmp/tmp"
+    iconv_args="-f UTF-8 -t UCS-4"
+    run_iconv -o "$tmp/out1" "$tmp/tmp"
+    iconv_args="-f UCS-4 -t UTF-8"
+    run_iconv -o "$tmp/out" "$tmp/out1"
+    expect_files prefix utf8-sequence
+
+    prefix="$prefix@"
+    prefix_length=$(($prefix_length + 1))
+done
+
 if $failure ; then
    exit 1
 fi