ICU-1680 Fix crash with transliterators that were exceeding the buffer size.

Also, improve speed a bit. Finally, add a -b, --block-size size option, which is not only useful for Real People(tm) wanting a bigger buffer size, but also great for testing that everything runs smoothly, for example by using -b 1. X-SVN-Rev: 7548
2002-01-31 22:44:42 +00:00 · 2002-01-31 22:44:42 +00:00 · f56fb8ddba
commit f56fb8ddba
parent 2918d7d229
3 changed files with 157 additions and 114 deletions
--- a/icu4c/source/extra/uconv/root.txt
+++ b/icu4c/source/extra/uconv/root.txt
@ -1,6 +1,6 @@
 // -*- Coding: utf-8; -*-  [all uconv resource files]
 // Copyright (c) 2000 IBM, Inc. and Others.
-// $Revision: 1.19 $
+// $Revision: 1.20 $
 //
 // Root translation file for uconv messages.
 // So you want to translate this file??? Great!
@ -28,7 +28,7 @@ root

  lcUsageWord { "usage" }
  ucUsageWord { "Usage" }
-  usage { "{0}: {1} [ -h, -?, --help ] [ -V, --version ] [ -s, --silent ] [ -v, --verbose ] [ -l, --list | --list-code code | --default-code | -L, --list-transliterators ] [ --canon ] [ -x transliterator ] [ --to-callback callback | -c ] [ --from-callback callback | -i ] [ --callback callback ] [ --fallback | --no-fallback ] -f, --from-code code -t, --to-code code [ file ... ] [ -o, --output file ]\n" }
+  usage { "{0}: {1} [ -h, -?, --help ] [ -V, --version ] [ -s, --silent ] [ -v, --verbose ] [ -l, --list | --list-code code | --default-code | -L, --list-transliterators ] [ --canon ] [ -x transliterator ] [ --to-callback callback | -c ] [ --from-callback callback | -i ] [ --callback callback ] [ --fallback | --no-fallback ] [ -b, --block-size size ] -f, --from-code code -t, --to-code code [ file ... ] [ -o, --output file ]\n" }

  help {  "Options:  -h, --help                    print this message\n"
          "          -V, --version                 print the program version\n"
@ -45,6 +45,7 @@ root
 	  "          --from-callback callback      use callback on original encoding\n"
 	  "          -i                            ignore invalid sequences in the input\n"
          "          --callback callback           use callback on both encodings\n"
+	  "          -b, --block-size size         read size bytes blocks (default: 4096)"
 	  "          --fallback                    use fallback mapping\n"
 	  "          --no-fallback                 do not use fallback mapping\n"
 	  "          -f, --from-code code          set the original encoding\n"
@ -61,6 +62,8 @@ root
  noFromCodeset  { "No conversion from encoding given (use -f)\n" }
  noToCodeset    {  "No conversion to encoding given (use -t)\n"  }

+  badBlockSize	{ "Bad block size: {0}.\n" } // 0: size of the block
+
  cantOpenFromCodeset { "Couldn''t open from encoding {0}: {1}\n" }  // 0:set, 1: err
  cantOpenToCodeset { "Couldn''t open to encoding {0}: {1}\n" } // 0: set, 1: err

--- a/icu4c/source/extra/uconv/uconv.1.in
+++ b/icu4c/source/extra/uconv/uconv.1.in
@ -57,6 +57,9 @@
 |
 .BI "\-\-no\-fallback"
 ]
+[
+.BI "\-b\fP, \fB\-\-block\-size" " size"
+]
 .BI "\-f\fP, \fB\-\-from\-code" " encoding"
 .BI "\-t\fP, \fB\-\-to\-code" " encoding"
 [
@ -184,6 +187,12 @@ Do not use the fallback mapping when transcoding from Unicode to the
 destination encoding.
 This is the default.
 .TP
+.BI "\-b\fP, \fB\-\-block\-size" " size"
+Read input in blocks of
+.I size
+bytes at a time. The default block size is
+4096.
+.TP
 .BI "\-f\fP, \fB\-\-from\-code" " encoding"
 Set the original encoding of the data to 
 .IR encoding .
--- a/icu4c/source/extra/uconv/uconv.cpp
+++ b/icu4c/source/extra/uconv/uconv.cpp
@ -35,7 +35,7 @@
 #include <fcntl.h>
 #endif

-static const size_t buffsize = 4096;	/* Size of conversion buffer. */
+#define DEFAULT_BUFSZ	4096

 static UResourceBundle *gBundle = 0;	/* Bundle containing messages. */

@ -369,35 +369,31 @@ static UBool convertFile(const char *pname,
 			 UConverterFromUCallback fromucallback,
 			 const void *fromuctxt,
 			 int fallback,
+                         size_t bufsz,
 			 const char *translit,
 			 const char *infilestr,
-			 FILE * outfile, int verbose)
-{
+			 FILE * outfile, int verbose) {
    FILE *infile;
    UBool ret = TRUE;
    UConverter *convfrom = 0;
    UConverter *convto = 0;
    UErrorCode err = U_ZERO_ERROR;
    UBool flush;
-    const char *cbuffiter;
-    char *buffiter;
-    const size_t readsize = buffsize - 1;
-    char *buff = 0;
+    const char *cbufp;
+    char *bufp;
+    char *buf = 0;

    uint32_t foffset = 0;	/* Where we are in the file, for error reporting. */

-    UConverterFromUCallback oldfromucallback;
-    UConverterToUCallback oldtoucallback;
-    const void *oldcontext;
-
-    const UChar *cuniiter;
-    UChar *uniiter;
-    UChar *unibuff = 0;
+    const UChar *unibufbp;
+    UChar *unibufp;
+    UChar *unibuf = 0;
    int32_t *fromoffsets = 0, *tooffsets = 0;

-    size_t rd, totbuffsize;
+    size_t rd, tobufsz;

-    Transliterator *t = NULL;
+    Transliterator *t = 0;      // Transliterator acting on Unicode data.
+    UnicodeString u;            // String to do the transliteration.

    // Open the correct input file or connect to stdin for reading input

@ -424,6 +420,7 @@ static UBool convertFile(const char *pname,
    if (verbose) {
 	fprintf(stderr, "%s:\n", infilestr);
    }
+
    // Create transliterator as needed.

    if (translit != NULL && *translit) {
@ -441,6 +438,7 @@ static UBool convertFile(const char *pname,
 	    goto error_exit;
 	}
    }
+
    // Create codepage converter. If the codepage or its aliases weren't
    // available, it returns NULL and a failure code. We also set the
    // callbacks, and return errors in the same way.
@ -453,8 +451,7 @@ static UBool convertFile(const char *pname,
 	       u_wmsg_errorName(err));
 	goto error_exit;
    }
-    ucnv_setToUCallBack(convfrom, toucallback, touctxt, &oldtoucallback,
-			&oldcontext, &err);
+    ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err);
    if (U_FAILURE(err)) {
 	initMsg(pname);
 	u_wmsg("cantSetCallback", u_wmsg_errorName(err));
@ -469,8 +466,7 @@ static UBool convertFile(const char *pname,
 	       u_wmsg_errorName(err));
 	goto error_exit;
    }
-    ucnv_setFromUCallBack(convto, fromucallback, fromuctxt,
-			  &oldfromucallback, &oldcontext, &err);
+    ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err);
    if (U_FAILURE(err)) {
 	initMsg(pname);
 	u_wmsg("cantSetCallback", u_wmsg_errorName(err));
@ -479,20 +475,21 @@ static UBool convertFile(const char *pname,
    ucnv_setFallback(convto, fallback);

    // To ensure that the buffer always is of enough size, we
-    // must take the worst case scenario, that is the character in the codepage
-    // that uses the most bytes and multiply it against the buffsize
+    // must take the worst case scenario, that is the character in
+    // the codepage that uses the most bytes and multiply it against
+    // the buffer size.

-    totbuffsize = buffsize * ucnv_getMaxCharSize(convto);
-    buff = new char[totbuffsize];
-    unibuff = new UChar[buffsize];
+    tobufsz = bufsz * ucnv_getMaxCharSize(convto);
+    buf = new char[tobufsz];
+    unibuf = new UChar[bufsz];

-    fromoffsets = new int32_t[buffsize];
-    tooffsets = new int32_t[totbuffsize];
+    fromoffsets = new int32_t[bufsz];
+    tooffsets = new int32_t[tobufsz];

    // OK, we can convert now.

    do {
-	rd = fread(buff, 1, readsize, infile);
+	rd = fread(buf, 1, bufsz, infile);
 	if (ferror(infile) != 0) {
 	    UnicodeString str(strerror(errno));
 	    str.append((UChar32) 0);
@ -500,22 +497,25 @@ static UBool convertFile(const char *pname,
 	    u_wmsg("cantRead", str.getBuffer());
 	    goto error_exit;
 	}
-	// Convert the read buffer into the new coding
-	// After the call 'uniiter' will be placed on the last character that was converted
-	// in the 'unibuff'. 
-	// Also the 'cbuffiter' is positioned on the last converted character.
-	// At the last conversion in the file, flush should be set to true so that
-	// we get all characters converted
-	//
-	// The converter must be flushed at the end of conversion so that characters
-	// on hold also will be written
-	uniiter = unibuff;
-	cbuffiter = buff;
-	flush = rd != readsize;
-	ucnv_toUnicode(convfrom, &uniiter, uniiter + buffsize, &cbuffiter,
-		       cbuffiter + rd, fromoffsets, flush, &err);

-	foffset += cbuffiter - buff;
+	// Convert the read buffer into the new coding
+	// After the call 'unibufp' will be placed on the last
+        // character that was converted in the 'unibuf'. 
+	// Also the 'cbufp' is positioned on the last converted
+        // character.
+	// At the last conversion in the file, flush should be set to
+        // true so that we get all characters converted
+	//
+	// The converter must be flushed at the end of conversion so
+        // that characters on hold also will be written.
+
+	unibufp = unibuf;
+	cbufp = buf;
+	flush = rd != bufsz;
+	ucnv_toUnicode(convfrom, &unibufp, unibufp + bufsz, &cbufp,
+		       cbufp + rd, fromoffsets, flush, &err);
+
+	foffset += cbufp - buf;

 	if (U_FAILURE(err)) {
 	    char pos[32];
@ -526,9 +526,11 @@ static UBool convertFile(const char *pname,
 		   u_wmsg_errorName(err));
 	    goto error_exit;
 	}
-	// At the last conversion, the converted characters should be equal to number
-	// of chars read.
-	if (flush && cbuffiter != (buff + rd)) {
+
+	// At the last conversion, the converted characters should be
+        // equal to number of chars read.
+
+	if (flush && cbufp != (buf + rd)) {
 	    char pos[32];
 	    sprintf(pos, "%u", foffset);
 	    UnicodeString str(pos, strlen(pos) + 1);
@ -536,87 +538,99 @@ static UBool convertFile(const char *pname,
 	    u_wmsg("premEndInput", str.getBuffer());
 	    goto error_exit;
 	}
-	// Convert the Unicode buffer into the destination codepage
-	// Again 'buffiter' will be placed on the last converted character
-	// And 'cuniiter' will be placed on the last converted unicode character
-	// At the last conversion flush should be set to true to ensure that 
-	// all characters left get converted

-	UnicodeString u(unibuff, uniiter - unibuff);
-	buffiter = buff;
-	cuniiter = unibuff;
+        // Prepare to transliterate and convert.
+
+        if (t) {
+            u.setTo(unibuf, unibufp - unibuf); // Copy into string.
+        } else {
+            u.setTo(unibuf, unibufp - unibuf, bufsz); // Share the buffer.
+        }
+
+        // Transliterate if needed.

 	if (t) {
 	    t->transliterate(u);
-	    u.extract(0, u.length(), unibuff, 0);
-	    uniiter = unibuff + u.length();
-
 	}

-	ucnv_fromUnicode(convto, &buffiter, buffiter + totbuffsize,
-			 &cuniiter,
-			 cuniiter + (size_t) (uniiter - unibuff),
-			 tooffsets, flush, &err);
+        int32_t ulen = u.length();

-	if (U_FAILURE(err)) {
-	    char pos[32];
+	// Convert the Unicode buffer into the destination codepage
+	// Again 'bufp' will be placed on the last converted character
+	// And 'unibufbp' will be placed on the last converted unicode character
+	// At the last conversion flush should be set to true to ensure that 
+	// all characters left get converted

-	    uint32_t erroffset =
-		dataOffset(fromoffsets, buffiter - buff, tooffsets);
+        unibufbp = u.getBuffer();

-	    sprintf(pos, "%u", foffset - (uniiter - unibuff) + erroffset);
-	    UnicodeString str(pos, strlen(pos) + 1);
-	    initMsg(pname);
-	    u_wmsg("problemCvtFromU", str.getBuffer(),
-		   u_wmsg_errorName(err));
-	    goto error_exit;
-	}
-	// At the last conversion, the converted characters should be equal to number
-	// of consumed characters.
-	if (flush && cuniiter != (unibuff + (size_t) (uniiter - unibuff))) {
-	    char pos[32];
-	    sprintf(pos, "%u", foffset);
-	    UnicodeString str(pos, strlen(pos) + 1);
-	    initMsg(pname);
-	    u_wmsg("premEnd", str.getBuffer());
-	    goto error_exit;
-	}
-	// Finally, write the converted buffer to the output file
-	rd = (size_t) (buffiter - buff);
-	if (fwrite(buff, 1, rd, outfile) != rd) {
-	    UnicodeString str(strerror(errno), "");
-	    initMsg(pname);
-	    u_wmsg("cantWrite", str.getBuffer());
-	    goto error_exit;
-	}
+        do {
+            int32_t len = ulen > bufsz ? bufsz : ulen;

-    } while (!flush);		// Stop when we have flushed the converters (this means that it's the end of output)
+            bufp = buf;
+	    unibufp = (UChar *) (unibufbp + len);
+
+            ucnv_fromUnicode(convto, &bufp, bufp + tobufsz,
+                             &unibufbp,
+                             unibufp,
+                             tooffsets, flush, &err);
+            
+            if (U_FAILURE(err)) {
+                char pos[32];
+                
+                uint32_t erroffset =
+                    dataOffset(fromoffsets, bufp - buf, tooffsets);
+                
+                sprintf(pos, "%u", foffset - (unibufp - unibuf) + erroffset);
+                UnicodeString str(pos, strlen(pos) + 1);
+                initMsg(pname);
+                u_wmsg("problemCvtFromU", str.getBuffer(),
+                       u_wmsg_errorName(err));
+                goto error_exit;
+            }
+
+            // At the last conversion, the converted characters should be equal to number
+            // of consumed characters.
+            if (flush && unibufbp != (unibuf + (size_t) (unibufp - unibuf))) {
+                char pos[32];
+                sprintf(pos, "%u", foffset);
+                UnicodeString str(pos, strlen(pos) + 1);
+                initMsg(pname);
+                u_wmsg("premEnd", str.getBuffer());
+                goto error_exit;
+            }
+            
+            // Finally, write the converted buffer to the output file
+            
+            rd = (size_t) (bufp - buf);
+            if (fwrite(buf, 1, rd, outfile) != rd) {
+                UnicodeString str(strerror(errno), "");
+                initMsg(pname);
+                u_wmsg("cantWrite", str.getBuffer());
+                goto error_exit;
+            }
+        } while ((ulen -= bufsz) > 0);
+    } while (!flush);		// Stop when we have flushed the
+                                // converters (this means that it's
+                                // the end of output) 

    goto normal_exit;

- error_exit:
+error_exit:
    ret = FALSE;

- normal_exit:
-    // Close the created converters
+normal_exit:
+    // Cleanup.

-    if (convfrom)
-	ucnv_close(convfrom);
-    if (convto)
-	ucnv_close(convto);
+    if (convfrom) ucnv_close(convfrom);
+    if (convto)	ucnv_close(convto);

-    if (t)
-	delete t;
+    if (t) delete t;

-    if (buff)
-	delete[]buff;
-    if (unibuff)
-	delete[]unibuff;
+    if (buf) delete[] buf;
+    if (unibuf)	delete[] unibuf;

-    if (fromoffsets)
-	delete[]fromoffsets;
-    if (tooffsets)
-	delete[]tooffsets;
+    if (fromoffsets) delete[] fromoffsets;
+    if (tooffsets) delete[] tooffsets;

    if (infile != stdin) {
 	fclose(infile);
@ -625,8 +639,7 @@ static UBool convertFile(const char *pname,
    return ret;
 }

-static void usage(const char *pname, int ecode)
-{
+static void usage(const char *pname, int ecode) {
    const UChar *msg;
    int32_t msgLen;
    UErrorCode err = U_ZERO_ERROR;
@ -662,6 +675,8 @@ int main(int argc, char **argv)
    int ret = 0;
    int seenf = 0;

+    size_t bufsz = DEFAULT_BUFSZ;
+
    const char *fromcpage = 0;
    const char *tocpage = 0;
    const char *translit = 0;
@ -716,6 +731,20 @@ int main(int argc, char **argv)
 	    fallback = 1;
 	} else if (!strcmp("--no-fallback", *iter)) {
 	    fallback = 0;
+	} else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) {
+	    iter++;
+	    if (iter != end) {
+		bufsz = atoi(*iter);
+                if ((int) bufsz <= 0) {
+                    initMsg(pname);
+		    UnicodeString str(*iter);
+		    initMsg(pname);
+		    u_wmsg("badBlockSize", str.getBuffer());
+		    return 3;
+                }
+            } else {
+		usage(pname, 1);
+            }
 	} else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) {
 	    if (printTranslits) {
 		usage(pname, 1);
@ -885,6 +914,8 @@ int main(int argc, char **argv)
 	    ;
 	} else if (!strcmp("--no-fallback", *iter)) {
 	    ;
+	} else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) {
+	    iter++;
 	} else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) {
 	    ;
 	} else if (strcmp("--default-code", *iter) == 0) {
@ -921,7 +952,7 @@ int main(int argc, char **argv)
 	    seenf = 1;
 	    if (!convertFile
 		(pname, fromcpage, toucallback, touctxt, tocpage,
-		 fromucallback, fromuctxt, fallback, translit, *iter,
+		 fromucallback, fromuctxt, fallback, bufsz, translit, *iter,
 		 outfile, verbose)) {
 		goto error_exit;
 	    }
@ -931,7 +962,7 @@ int main(int argc, char **argv)
    if (!seenf) {
 	if (!convertFile
 	    (pname, fromcpage, toucallback, touctxt, tocpage,
-	     fromucallback, fromuctxt, fallback, translit, 0, outfile,
+	     fromucallback, fromuctxt, fallback, bufsz, translit, 0, outfile,
 	     verbose)) {
 	    goto error_exit;
 	}