ICU-1680 Fix crash with transliterators whose output was exceeding the
buffer size. Also, improve speed a bit. Finally, add a -b, --block-size size option, which is not only useful for Real People(tm) wanting a bigger buffer size, but also great for testing that everything runs smoothly, for example by using -b 1. X-SVN-Rev: 7548
This commit is contained in:
parent
2918d7d229
commit
f56fb8ddba
@ -1,6 +1,6 @@
|
||||
// -*- Coding: utf-8; -*- [all uconv resource files]
|
||||
// Copyright (c) 2000 IBM, Inc. and Others.
|
||||
// $Revision: 1.19 $
|
||||
// $Revision: 1.20 $
|
||||
//
|
||||
// Root translation file for uconv messages.
|
||||
// So you want to translate this file??? Great!
|
||||
@ -28,7 +28,7 @@ root
|
||||
|
||||
lcUsageWord { "usage" }
|
||||
ucUsageWord { "Usage" }
|
||||
usage { "{0}: {1} [ -h, -?, --help ] [ -V, --version ] [ -s, --silent ] [ -v, --verbose ] [ -l, --list | --list-code code | --default-code | -L, --list-transliterators ] [ --canon ] [ -x transliterator ] [ --to-callback callback | -c ] [ --from-callback callback | -i ] [ --callback callback ] [ --fallback | --no-fallback ] -f, --from-code code -t, --to-code code [ file ... ] [ -o, --output file ]\n" }
|
||||
usage { "{0}: {1} [ -h, -?, --help ] [ -V, --version ] [ -s, --silent ] [ -v, --verbose ] [ -l, --list | --list-code code | --default-code | -L, --list-transliterators ] [ --canon ] [ -x transliterator ] [ --to-callback callback | -c ] [ --from-callback callback | -i ] [ --callback callback ] [ --fallback | --no-fallback ] [ -b, --block-size size ] -f, --from-code code -t, --to-code code [ file ... ] [ -o, --output file ]\n" }
|
||||
|
||||
help { "Options: -h, --help print this message\n"
|
||||
" -V, --version print the program version\n"
|
||||
@ -45,6 +45,7 @@ root
|
||||
" --from-callback callback use callback on original encoding\n"
|
||||
" -i ignore invalid sequences in the input\n"
|
||||
" --callback callback use callback on both encodings\n"
|
||||
" -b, --block-size size read input in blocks of size bytes (default: 4096)\n"
|
||||
" --fallback use fallback mapping\n"
|
||||
" --no-fallback do not use fallback mapping\n"
|
||||
" -f, --from-code code set the original encoding\n"
|
||||
@ -61,6 +62,8 @@ root
|
||||
noFromCodeset { "No conversion from encoding given (use -f)\n" }
|
||||
noToCodeset { "No conversion to encoding given (use -t)\n" }
|
||||
|
||||
badBlockSize { "Bad block size: {0}.\n" } // 0: size of the block
|
||||
|
||||
cantOpenFromCodeset { "Couldn''t open from encoding {0}: {1}\n" } // 0:set, 1: err
|
||||
cantOpenToCodeset { "Couldn''t open to encoding {0}: {1}\n" } // 0: set, 1: err
|
||||
|
||||
|
@ -57,6 +57,9 @@
|
||||
|
|
||||
.BI "\-\-no\-fallback"
|
||||
]
|
||||
[
|
||||
.BI "\-b\fP, \fB\-\-block\-size" " size"
|
||||
]
|
||||
.BI "\-f\fP, \fB\-\-from\-code" " encoding"
|
||||
.BI "\-t\fP, \fB\-\-to\-code" " encoding"
|
||||
[
|
||||
@ -184,6 +187,12 @@ Do not use the fallback mapping when transcoding from Unicode to the
|
||||
destination encoding.
|
||||
This is the default.
|
||||
.TP
|
||||
.BI "\-b\fP, \fB\-\-block\-size" " size"
|
||||
Read input in blocks of
|
||||
.I size
|
||||
bytes at a time. The default block size is
|
||||
4096.
|
||||
.TP
|
||||
.BI "\-f\fP, \fB\-\-from\-code" " encoding"
|
||||
Set the original encoding of the data to
|
||||
.IR encoding .
|
||||
|
@ -35,7 +35,7 @@
|
||||
#include <fcntl.h>
|
||||
#endif
|
||||
|
||||
static const size_t buffsize = 4096; /* Size of conversion buffer. */
|
||||
#define DEFAULT_BUFSZ 4096
|
||||
|
||||
static UResourceBundle *gBundle = 0; /* Bundle containing messages. */
|
||||
|
||||
@ -369,35 +369,31 @@ static UBool convertFile(const char *pname,
|
||||
UConverterFromUCallback fromucallback,
|
||||
const void *fromuctxt,
|
||||
int fallback,
|
||||
size_t bufsz,
|
||||
const char *translit,
|
||||
const char *infilestr,
|
||||
FILE * outfile, int verbose)
|
||||
{
|
||||
FILE * outfile, int verbose) {
|
||||
FILE *infile;
|
||||
UBool ret = TRUE;
|
||||
UConverter *convfrom = 0;
|
||||
UConverter *convto = 0;
|
||||
UErrorCode err = U_ZERO_ERROR;
|
||||
UBool flush;
|
||||
const char *cbuffiter;
|
||||
char *buffiter;
|
||||
const size_t readsize = buffsize - 1;
|
||||
char *buff = 0;
|
||||
const char *cbufp;
|
||||
char *bufp;
|
||||
char *buf = 0;
|
||||
|
||||
uint32_t foffset = 0; /* Where we are in the file, for error reporting. */
|
||||
|
||||
UConverterFromUCallback oldfromucallback;
|
||||
UConverterToUCallback oldtoucallback;
|
||||
const void *oldcontext;
|
||||
|
||||
const UChar *cuniiter;
|
||||
UChar *uniiter;
|
||||
UChar *unibuff = 0;
|
||||
const UChar *unibufbp;
|
||||
UChar *unibufp;
|
||||
UChar *unibuf = 0;
|
||||
int32_t *fromoffsets = 0, *tooffsets = 0;
|
||||
|
||||
size_t rd, totbuffsize;
|
||||
size_t rd, tobufsz;
|
||||
|
||||
Transliterator *t = NULL;
|
||||
Transliterator *t = 0; // Transliterator acting on Unicode data.
|
||||
UnicodeString u; // String to do the transliteration.
|
||||
|
||||
// Open the correct input file or connect to stdin for reading input
|
||||
|
||||
@ -424,6 +420,7 @@ static UBool convertFile(const char *pname,
|
||||
if (verbose) {
|
||||
fprintf(stderr, "%s:\n", infilestr);
|
||||
}
|
||||
|
||||
// Create transliterator as needed.
|
||||
|
||||
if (translit != NULL && *translit) {
|
||||
@ -441,6 +438,7 @@ static UBool convertFile(const char *pname,
|
||||
goto error_exit;
|
||||
}
|
||||
}
|
||||
|
||||
// Create codepage converter. If the codepage or its aliases weren't
|
||||
// available, it returns NULL and a failure code. We also set the
|
||||
// callbacks, and return errors in the same way.
|
||||
@ -453,8 +451,7 @@ static UBool convertFile(const char *pname,
|
||||
u_wmsg_errorName(err));
|
||||
goto error_exit;
|
||||
}
|
||||
ucnv_setToUCallBack(convfrom, toucallback, touctxt, &oldtoucallback,
|
||||
&oldcontext, &err);
|
||||
ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err);
|
||||
if (U_FAILURE(err)) {
|
||||
initMsg(pname);
|
||||
u_wmsg("cantSetCallback", u_wmsg_errorName(err));
|
||||
@ -469,8 +466,7 @@ static UBool convertFile(const char *pname,
|
||||
u_wmsg_errorName(err));
|
||||
goto error_exit;
|
||||
}
|
||||
ucnv_setFromUCallBack(convto, fromucallback, fromuctxt,
|
||||
&oldfromucallback, &oldcontext, &err);
|
||||
ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err);
|
||||
if (U_FAILURE(err)) {
|
||||
initMsg(pname);
|
||||
u_wmsg("cantSetCallback", u_wmsg_errorName(err));
|
||||
@ -479,20 +475,21 @@ static UBool convertFile(const char *pname,
|
||||
ucnv_setFallback(convto, fallback);
|
||||
|
||||
// To ensure that the buffer always is of enough size, we
|
||||
// must take the worst case scenario, that is the character in the codepage
|
||||
// that uses the most bytes and multiply it against the buffsize
|
||||
// must take the worst case scenario, that is the character in
|
||||
// the codepage that uses the most bytes and multiply it against
|
||||
// the buffer size.
|
||||
|
||||
totbuffsize = buffsize * ucnv_getMaxCharSize(convto);
|
||||
buff = new char[totbuffsize];
|
||||
unibuff = new UChar[buffsize];
|
||||
tobufsz = bufsz * ucnv_getMaxCharSize(convto);
|
||||
buf = new char[tobufsz];
|
||||
unibuf = new UChar[bufsz];
|
||||
|
||||
fromoffsets = new int32_t[buffsize];
|
||||
tooffsets = new int32_t[totbuffsize];
|
||||
fromoffsets = new int32_t[bufsz];
|
||||
tooffsets = new int32_t[tobufsz];
|
||||
|
||||
// OK, we can convert now.
|
||||
|
||||
do {
|
||||
rd = fread(buff, 1, readsize, infile);
|
||||
rd = fread(buf, 1, bufsz, infile);
|
||||
if (ferror(infile) != 0) {
|
||||
UnicodeString str(strerror(errno));
|
||||
str.append((UChar32) 0);
|
||||
@ -500,22 +497,25 @@ static UBool convertFile(const char *pname,
|
||||
u_wmsg("cantRead", str.getBuffer());
|
||||
goto error_exit;
|
||||
}
|
||||
// Convert the read buffer into the new coding
|
||||
// After the call 'uniiter' will be placed on the last character that was converted
|
||||
// in the 'unibuff'.
|
||||
// Also the 'cbuffiter' is positioned on the last converted character.
|
||||
// At the last conversion in the file, flush should be set to true so that
|
||||
// we get all characters converted
|
||||
//
|
||||
// The converter must be flushed at the end of conversion so that characters
|
||||
// on hold also will be written
|
||||
uniiter = unibuff;
|
||||
cbuffiter = buff;
|
||||
flush = rd != readsize;
|
||||
ucnv_toUnicode(convfrom, &uniiter, uniiter + buffsize, &cbuffiter,
|
||||
cbuffiter + rd, fromoffsets, flush, &err);
|
||||
|
||||
foffset += cbuffiter - buff;
|
||||
// Convert the read buffer into the new coding
|
||||
// After the call 'unibufp' will point just past the last
|
||||
// UChar that was written to 'unibuf'.
|
||||
// Likewise, 'cbufp' will point just past the last input
|
||||
// byte that was consumed.
|
||||
// At the last conversion in the file, flush should be set to
|
||||
// true so that we get all characters converted
|
||||
//
|
||||
// The converter must be flushed at the end of conversion so
|
||||
// that characters on hold also will be written.
|
||||
|
||||
unibufp = unibuf;
|
||||
cbufp = buf;
|
||||
flush = rd != bufsz;
|
||||
ucnv_toUnicode(convfrom, &unibufp, unibufp + bufsz, &cbufp,
|
||||
cbufp + rd, fromoffsets, flush, &err);
|
||||
|
||||
foffset += cbufp - buf;
|
||||
|
||||
if (U_FAILURE(err)) {
|
||||
char pos[32];
|
||||
@ -526,9 +526,11 @@ static UBool convertFile(const char *pname,
|
||||
u_wmsg_errorName(err));
|
||||
goto error_exit;
|
||||
}
|
||||
// At the last conversion, the converted characters should be equal to number
|
||||
// of chars read.
|
||||
if (flush && cbuffiter != (buff + rd)) {
|
||||
|
||||
// At the last conversion, the converted characters should be
|
||||
// equal to number of chars read.
|
||||
|
||||
if (flush && cbufp != (buf + rd)) {
|
||||
char pos[32];
|
||||
sprintf(pos, "%u", foffset);
|
||||
UnicodeString str(pos, strlen(pos) + 1);
|
||||
@ -536,87 +538,99 @@ static UBool convertFile(const char *pname,
|
||||
u_wmsg("premEndInput", str.getBuffer());
|
||||
goto error_exit;
|
||||
}
|
||||
// Convert the Unicode buffer into the destination codepage
|
||||
// Again 'buffiter' will be placed on the last converted character
|
||||
// And 'cuniiter' will be placed on the last converted unicode character
|
||||
// At the last conversion flush should be set to true to ensure that
|
||||
// all characters left get converted
|
||||
|
||||
UnicodeString u(unibuff, uniiter - unibuff);
|
||||
buffiter = buff;
|
||||
cuniiter = unibuff;
|
||||
// Prepare to transliterate and convert.
|
||||
|
||||
if (t) {
|
||||
u.setTo(unibuf, unibufp - unibuf); // Copy into string.
|
||||
} else {
|
||||
u.setTo(unibuf, unibufp - unibuf, bufsz); // Share the buffer.
|
||||
}
|
||||
|
||||
// Transliterate if needed.
|
||||
|
||||
if (t) {
|
||||
t->transliterate(u);
|
||||
u.extract(0, u.length(), unibuff, 0);
|
||||
uniiter = unibuff + u.length();
|
||||
|
||||
}
|
||||
|
||||
ucnv_fromUnicode(convto, &buffiter, buffiter + totbuffsize,
|
||||
&cuniiter,
|
||||
cuniiter + (size_t) (uniiter - unibuff),
|
||||
tooffsets, flush, &err);
|
||||
int32_t ulen = u.length();
|
||||
|
||||
if (U_FAILURE(err)) {
|
||||
char pos[32];
|
||||
// Convert the Unicode buffer into the destination codepage
|
||||
// Again 'bufp' will point just past the last byte written
|
||||
// and 'unibufbp' will point just past the last UChar consumed.
|
||||
// At the last conversion flush should be set to true to ensure that
|
||||
// all characters left get converted
|
||||
|
||||
uint32_t erroffset =
|
||||
dataOffset(fromoffsets, buffiter - buff, tooffsets);
|
||||
unibufbp = u.getBuffer();
|
||||
|
||||
sprintf(pos, "%u", foffset - (uniiter - unibuff) + erroffset);
|
||||
UnicodeString str(pos, strlen(pos) + 1);
|
||||
initMsg(pname);
|
||||
u_wmsg("problemCvtFromU", str.getBuffer(),
|
||||
u_wmsg_errorName(err));
|
||||
goto error_exit;
|
||||
}
|
||||
// At the last conversion, the converted characters should be equal to number
|
||||
// of consumed characters.
|
||||
if (flush && cuniiter != (unibuff + (size_t) (uniiter - unibuff))) {
|
||||
char pos[32];
|
||||
sprintf(pos, "%u", foffset);
|
||||
UnicodeString str(pos, strlen(pos) + 1);
|
||||
initMsg(pname);
|
||||
u_wmsg("premEnd", str.getBuffer());
|
||||
goto error_exit;
|
||||
}
|
||||
// Finally, write the converted buffer to the output file
|
||||
rd = (size_t) (buffiter - buff);
|
||||
if (fwrite(buff, 1, rd, outfile) != rd) {
|
||||
UnicodeString str(strerror(errno), "");
|
||||
initMsg(pname);
|
||||
u_wmsg("cantWrite", str.getBuffer());
|
||||
goto error_exit;
|
||||
}
|
||||
do {
|
||||
int32_t len = ulen > bufsz ? bufsz : ulen;
|
||||
|
||||
} while (!flush); // Stop when we have flushed the converters (this means that it's the end of output)
|
||||
bufp = buf;
|
||||
unibufp = (UChar *) (unibufbp + len);
|
||||
|
||||
ucnv_fromUnicode(convto, &bufp, bufp + tobufsz,
|
||||
&unibufbp,
|
||||
unibufp,
|
||||
tooffsets, flush, &err);
|
||||
|
||||
if (U_FAILURE(err)) {
|
||||
char pos[32];
|
||||
|
||||
uint32_t erroffset =
|
||||
dataOffset(fromoffsets, bufp - buf, tooffsets);
|
||||
|
||||
sprintf(pos, "%u", foffset - (unibufp - unibuf) + erroffset);
|
||||
UnicodeString str(pos, strlen(pos) + 1);
|
||||
initMsg(pname);
|
||||
u_wmsg("problemCvtFromU", str.getBuffer(),
|
||||
u_wmsg_errorName(err));
|
||||
goto error_exit;
|
||||
}
|
||||
|
||||
// At the last conversion, the converted characters should be equal to number
|
||||
// of consumed characters.
|
||||
if (flush && unibufbp != (unibuf + (size_t) (unibufp - unibuf))) {
|
||||
char pos[32];
|
||||
sprintf(pos, "%u", foffset);
|
||||
UnicodeString str(pos, strlen(pos) + 1);
|
||||
initMsg(pname);
|
||||
u_wmsg("premEnd", str.getBuffer());
|
||||
goto error_exit;
|
||||
}
|
||||
|
||||
// Finally, write the converted buffer to the output file
|
||||
|
||||
rd = (size_t) (bufp - buf);
|
||||
if (fwrite(buf, 1, rd, outfile) != rd) {
|
||||
UnicodeString str(strerror(errno), "");
|
||||
initMsg(pname);
|
||||
u_wmsg("cantWrite", str.getBuffer());
|
||||
goto error_exit;
|
||||
}
|
||||
} while ((ulen -= bufsz) > 0);
|
||||
} while (!flush); // Stop when we have flushed the
|
||||
// converters (this means that it's
|
||||
// the end of output)
|
||||
|
||||
goto normal_exit;
|
||||
|
||||
error_exit:
|
||||
error_exit:
|
||||
ret = FALSE;
|
||||
|
||||
normal_exit:
|
||||
// Close the created converters
|
||||
normal_exit:
|
||||
// Cleanup.
|
||||
|
||||
if (convfrom)
|
||||
ucnv_close(convfrom);
|
||||
if (convto)
|
||||
ucnv_close(convto);
|
||||
if (convfrom) ucnv_close(convfrom);
|
||||
if (convto) ucnv_close(convto);
|
||||
|
||||
if (t)
|
||||
delete t;
|
||||
if (t) delete t;
|
||||
|
||||
if (buff)
|
||||
delete[]buff;
|
||||
if (unibuff)
|
||||
delete[]unibuff;
|
||||
if (buf) delete[] buf;
|
||||
if (unibuf) delete[] unibuf;
|
||||
|
||||
if (fromoffsets)
|
||||
delete[]fromoffsets;
|
||||
if (tooffsets)
|
||||
delete[]tooffsets;
|
||||
if (fromoffsets) delete[] fromoffsets;
|
||||
if (tooffsets) delete[] tooffsets;
|
||||
|
||||
if (infile != stdin) {
|
||||
fclose(infile);
|
||||
@ -625,8 +639,7 @@ static UBool convertFile(const char *pname,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void usage(const char *pname, int ecode)
|
||||
{
|
||||
static void usage(const char *pname, int ecode) {
|
||||
const UChar *msg;
|
||||
int32_t msgLen;
|
||||
UErrorCode err = U_ZERO_ERROR;
|
||||
@ -662,6 +675,8 @@ int main(int argc, char **argv)
|
||||
int ret = 0;
|
||||
int seenf = 0;
|
||||
|
||||
size_t bufsz = DEFAULT_BUFSZ;
|
||||
|
||||
const char *fromcpage = 0;
|
||||
const char *tocpage = 0;
|
||||
const char *translit = 0;
|
||||
@ -716,6 +731,20 @@ int main(int argc, char **argv)
|
||||
fallback = 1;
|
||||
} else if (!strcmp("--no-fallback", *iter)) {
|
||||
fallback = 0;
|
||||
} else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) {
|
||||
iter++;
|
||||
if (iter != end) {
|
||||
bufsz = atoi(*iter);
|
||||
if ((int) bufsz <= 0) {
|
||||
initMsg(pname);
|
||||
UnicodeString str(*iter);
|
||||
initMsg(pname);
|
||||
u_wmsg("badBlockSize", str.getBuffer());
|
||||
return 3;
|
||||
}
|
||||
} else {
|
||||
usage(pname, 1);
|
||||
}
|
||||
} else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) {
|
||||
if (printTranslits) {
|
||||
usage(pname, 1);
|
||||
@ -885,6 +914,8 @@ int main(int argc, char **argv)
|
||||
;
|
||||
} else if (!strcmp("--no-fallback", *iter)) {
|
||||
;
|
||||
} else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) {
|
||||
iter++;
|
||||
} else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) {
|
||||
;
|
||||
} else if (strcmp("--default-code", *iter) == 0) {
|
||||
@ -921,7 +952,7 @@ int main(int argc, char **argv)
|
||||
seenf = 1;
|
||||
if (!convertFile
|
||||
(pname, fromcpage, toucallback, touctxt, tocpage,
|
||||
fromucallback, fromuctxt, fallback, translit, *iter,
|
||||
fromucallback, fromuctxt, fallback, bufsz, translit, *iter,
|
||||
outfile, verbose)) {
|
||||
goto error_exit;
|
||||
}
|
||||
@ -931,7 +962,7 @@ int main(int argc, char **argv)
|
||||
if (!seenf) {
|
||||
if (!convertFile
|
||||
(pname, fromcpage, toucallback, touctxt, tocpage,
|
||||
fromucallback, fromuctxt, fallback, translit, 0, outfile,
|
||||
fromucallback, fromuctxt, fallback, bufsz, translit, 0, outfile,
|
||||
verbose)) {
|
||||
goto error_exit;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user