ICU-2983 fix offset handling and error output

X-SVN-Rev: 14206
This commit is contained in:
Markus Scherer 2003-12-21 21:49:37 +00:00
parent a34698b244
commit 1b6f79bd4e
3 changed files with 169 additions and 62 deletions

View File

@ -1,6 +1,6 @@
// -*- Coding: utf-8; -*- [all uconv resource files]
// Copyright (c) 2000 IBM, Inc. and Others.
// $Revision: 1.2 $
// Copyright (c) 2000-2003 IBM, Inc. and Others.
// $Revision: 1.3 $
//
// Root translation file for uconv messages.
// So you want to translate this file??? Great!
@ -28,8 +28,21 @@ fr
lcUsageWord { "usage" }
ucUsageWord { "Usage" }
usage { "{0} : {1} [ -h, -?, --help ] [ -V, --version ] [ -s, --silent ] [ -v, --verbose ] [ -l, --list | --list-code code | --default-code | -L, --list-transliterators ] [ --canon ] [ -x translitération ] [ --to-callback callback | -c ] [ --from-callback callback | -i ] [ --callback callback ] [ --fallback | --no-fallback ] [ -b, --block-size taille ] [ -f, --from-code code ] [ -t, --to-code code ] [ -o, --output fichier ] [ fichier ... ]\n" }
usage {
"{0}: {1} "
"[ -h, -?, --help ] [ -V, --version ] [ -s, --silent ] [ -v, --verbose ] "
"[ -l, --list | --list-code code | --default-code | -L, --list-transliterators ] "
"[ --canon ] [ -x translitération ] "
"[ --to-callback callback | -c ] [ --from-callback callback | -i ] [ --callback callback ] "
"[ --fallback | --no-fallback ] "
"[ -b, --block-size taille ] "
"[ -f, --from-code code ] [ -t, --to-code code ] "
"[ --add-signature ] [ --remove-signature ] "
"[ -o, --output fichier ] "
"[ fichier ... ]\n"
}
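// TODO: the --add-signature and --remove-signature lines of the help text below are still in English and need to be translated into French.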
help { "Options : -h, --help affiche ce message\n"
" -V, --version affiche la version du programme\n"
" -s, --silent supprime les messages\n"
@ -50,6 +63,8 @@ fr
" --no-fallback n''utilise pas les correspondances de secours\n"
" -f, --from-code code fixe l''encodage d''origine\n"
" -t, --to-code code fixe l''encodage de destination\n"
" --add-signature add a U+FEFF Unicode signature character (BOM)\n"
" --remove-signature remove a U+FEFF Unicode signature character (BOM)\n"
" -o, --output fichier écrit la sortie dans fichier\n"
"\n"
"Callbacks :" }
@ -82,10 +97,8 @@ fr
cantWrite { "Le texte converti ne peut pas être écrit : {0}.\n" } // 0: OS error string
cantRead { "Erreur de lecture du fichier d''entrée : {0}.\n" } // 0: OS error string
premEnd { "Fin prématurée de la conversion d''Unicode vers l''encodage de destination à la position {0}.\n" } // 0: position
premEndInput { "Fin prématurée de l''entrée durant la conversion de l''encodage original vers Unicode à la position {0}.\n" } // 0: position
problemCvtToU { "La conversion d''Unicode vers l''encodage de destination a échoué à la position {0} : {1}.\n" } // 0: position, 1: err
problemCvtFromU { "La conversion de l''encodage original vers Unicode a échoué à la position {0} : {1}.\n" } // 0: position, 1: err
problemCvtFromUOut { "La conversion de l''encodage original vers Unicode a échoué à la position {0} de la sortie : {1}.\n" } // 0: position, 1: err
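// TODO: retranslate problemCvtToU, problemCvtFromU and problemCvtFromUOut. Their format changed in the root bundle from two arguments ({0}: position, {1}: err) to three ({0}: position, {1}: bytes or Unicode code points, {2}: err), so the old translations are commented out below.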
//problemCvtToU { "La conversion d''Unicode vers l''encodage de destination a échoué à la position {0} : {1}.\n" } // 0: position, 1: err
//problemCvtFromU { "La conversion de l''encodage original vers Unicode a échoué à la position {0} : {1}.\n" } // 0: position, 1: err
//problemCvtFromUOut { "La conversion de l''encodage original vers Unicode a échoué à la position {0} de la sortie : {1}.\n" } // 0: position, 1: err
}

View File

@ -1,6 +1,6 @@
// -*- Coding: utf-8; -*- [all uconv resource files]
// Copyright (c) 2000-2003 IBM, Inc. and Others.
// $Revision: 1.3 $
// $Revision: 1.4 $
//
// Root translation file for uconv messages.
// So you want to translate this file??? Great!
@ -96,12 +96,9 @@ root
cantWrite { "The converted text couldn't be written: {0}.\n" } // 0: OS error string
cantRead { "Error reading from input file: {0}.\n" } // 0: OS error string
premEnd { "Premature end of Unicode to destination encoding conversion at position {0}.\n" } // 0: position
premEndInput { "Premature end of input when converting from original encoding to Unicode at position {0}.\n" } // 0: position
problemCvtToU { "Conversion to Unicode from codepage failed at position {0}: {1}.\n" } // 0: position, 1: err
problemCvtFromU { "Conversion from Unicode to codepage failed at position {0}: {1}.\n"} // 0: position, 1: err
problemCvtFromUOut { "Conversion from Unicode to codepage failed at position {0} in output: {1}.\n"} // 0: position, 1: err
problemCvtToU { "Conversion to Unicode from codepage failed at input byte position {0}. Bytes: {1} Error: {2}\n" } // 0: position, 1: bytes, 2: err
problemCvtFromU { "Conversion from Unicode to codepage failed at input byte position {0}. Unicode: {1} Error: {2}\n"} // 0: position, 1: Unicode, 2: err
problemCvtFromUOut { "Conversion from Unicode to codepage failed at output byte position {0}. Unicode: {1} Error: {2}\n"} // 0: position, 1: Unicode, 2: err
// ICU errors - used by u_wmsg_errorName()

View File

@ -50,6 +50,8 @@
U_CFUNC char uconvmsg_dat[];
#endif
// Number of elements in a true array (not a pointer), as an int32_t.
// The whole expansion is parenthesized so that it stays a single operand
// wherever it is used (for example as the operand of sizeof).
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
#define DEFAULT_BUFSZ 4096
#define UCONVMSG "uconvmsg"
@ -270,7 +272,7 @@ static int printConverters(const char *pname, const char *lookfor,
if (U_FAILURE(err)) {
printf("%s", name);
UnicodeString str(name, (int32_t)(uprv_strlen(name) + 1));
UnicodeString str(name, "");
putchar('\t');
u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
u_wmsg_errorName(err));
@ -284,7 +286,7 @@ static int printConverters(const char *pname, const char *lookfor,
const char *alias = ucnv_getAlias(name, a, &err);
if (U_FAILURE(err)) {
UnicodeString str(name, (int32_t)(uprv_strlen(name) + 1));
UnicodeString str(name, "");
putchar('\t');
u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
u_wmsg_errorName(err));
@ -419,6 +421,15 @@ enum {
CNV_ADDS_FEFF // automatically adds/detects the U+FEFF signature character
};
// Convert the low 4 bits of n to the UTF-16 code unit of the matching
// lowercase hexadecimal digit: 0x30..0x39 for 0..9, 0x61..0x66 for 10..15.
// The upper 4 bits of n are ignored.
// Numeric code points are used instead of character literals so that the
// result does not depend on the compiler's execution character set.
inline UChar
nibbleToHex(uint8_t n) {
    const uint8_t digit = (uint8_t)(n & 0xf);
    if (digit > 9) {
        // 0x61 ('a') stands for the value 10
        return (UChar)((0x61 - 10) + digit);
    }
    return (UChar)(0x30 + digit);
}
// check the converter's Unicode signature properties;
// the fromUnicode side of the converter must be in its initial state
// and will be reset again if it was used
@ -465,26 +476,10 @@ cnvSigType(UConverter *cnv) {
return result;
}
/* Map a byte offset back to the offset of that byte in its source.
   whereto indexes tooffsets[] (tosz entries); the value found there indexes
   fromoffsets[] (fromsz entries), and that entry is the result.
   Returns 0 when either index falls outside its array. */
static inline int32_t dataOffset(int32_t whereto,
                                 const int32_t *fromoffsets, int32_t fromsz,
                                 const int32_t *tooffsets, int32_t tosz) {
    /* first hop: the given offset must be a valid index into tooffsets[] */
    if (whereto < 0 || whereto >= tosz) {
        return 0;
    }

    /* second hop: the intermediate offset must be a valid index into fromoffsets[] */
    const int32_t intermediate = tooffsets[whereto];
    if (intermediate < 0 || intermediate >= fromsz) {
        return 0;
    }

    return fromoffsets[intermediate];
}
class ConvertFile {
public:
ConvertFile() :
buf(NULL), fromoffsets(NULL), tooffsets(NULL),
buf(NULL), fromoffsets(NULL),
bufsz(0), signature(0) {}
void
@ -493,9 +488,8 @@ public:
buf = new char[bufsz];
// allocate one memory block for both offsets arrays
fromoffsets = new int32_t[2 * bufsz];
tooffsets = fromoffsets + bufsz;
// +1 for an added U+FEFF in the intermediate Unicode buffer
fromoffsets = new int32_t[bufsz + 1];
}
~ConvertFile() {
@ -518,7 +512,7 @@ private:
friend extern int main(int argc, char **argv);
char *buf;
int32_t *fromoffsets, *tooffsets;
int32_t *fromoffsets;
size_t bufsz;
int8_t signature; // add (1) or remove (-1) a U+FEFF Unicode signature character
@ -544,7 +538,7 @@ ConvertFile::convertFile(const char *pname,
UConverter *convto = 0;
UErrorCode err = U_ZERO_ERROR;
UBool flush;
const char *cbufp;
const char *cbufp, *prevbufp;
char *bufp;
uint32_t infoffset = 0, outfoffset = 0; /* Where we are in the file, for error reporting. */
@ -636,7 +630,7 @@ ConvertFile::convertFile(const char *pname,
convfrom = ucnv_open(fromcpage, &err);
if (U_FAILURE(err)) {
UnicodeString str(fromcpage, (int32_t)(uprv_strlen(fromcpage) + 1));
UnicodeString str(fromcpage, "");
initMsg(pname);
u_wmsg(stderr, "cantOpenFromCodeset", str.getTerminatedBuffer(),
u_wmsg_errorName(err));
@ -651,7 +645,7 @@ ConvertFile::convertFile(const char *pname,
convto = ucnv_open(tocpage, &err);
if (U_FAILURE(err)) {
UnicodeString str(tocpage, (int32_t)(uprv_strlen(tocpage) + 1));
UnicodeString str(tocpage, "");
initMsg(pname);
u_wmsg(stderr, "cantOpenToCodeset", str.getTerminatedBuffer(),
u_wmsg_errorName(err));
@ -670,10 +664,14 @@ ConvertFile::convertFile(const char *pname,
// OK, we can convert now.
sig = signature;
rd = 0;
do {
willexit = FALSE;
// input file offset at the beginning of the next buffer
infoffset += rd;
rd = fread(buf, 1, bufsz, infile);
if (ferror(infile) != 0) {
UnicodeString str(strerror(errno));
@ -698,18 +696,19 @@ ConvertFile::convertFile(const char *pname,
// convert until the input is consumed
do {
// remember the start of the current byte-to-Unicode conversion
prevbufp = cbufp;
unibuf = unibufp = u.getBuffer((int32_t)bufsz);
// Use bufsz instead of u.getCapacity() for the targetLimit
// so that we don't overflow fromoffsets[].
ucnv_toUnicode(convfrom, &unibufp, unibuf + bufsz, &cbufp,
cbufp + rd, fromoffsets, flush, &err);
buf + rd, fromoffsets, flush, &err);
ulen = (int32_t)(unibufp - unibuf);
u.releaseBuffer(ulen);
infoffset += (uint32_t)(cbufp - buf);
// fromSawEndOfBytes indicates that ucnv_toUnicode() is done
// converting all of the input bytes.
// It works like this because ucnv_toUnicode() returns only under the
@ -724,11 +723,40 @@ ConvertFile::convertFile(const char *pname,
if (err == U_BUFFER_OVERFLOW_ERROR) {
err = U_ZERO_ERROR;
} else if (U_FAILURE(err)) {
char pos[32];
sprintf(pos, "%u", infoffset - 1);
UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1));
char pos[32], errorBytes[32];
int8_t i, length, errorLength;
UErrorCode localError = U_ZERO_ERROR;
errorLength = (int8_t)sizeof(errorBytes);
ucnv_getInvalidChars(convfrom, errorBytes, &errorLength, &localError);
if (U_FAILURE(localError) || errorLength == 0) {
errorLength = 1;
}
// print the input file offset of the start of the error bytes:
// input file offset of the current byte buffer +
// length of the just consumed bytes -
// length of the error bytes
length =
(int8_t)sprintf(pos, "%d",
(int)(infoffset + (cbufp - buf) - errorLength));
// output the bytes that caused the error
UnicodeString str;
for (i = 0; i < errorLength; ++i) {
if (i > 0) {
str.append((UChar)0x20);
}
str.append(nibbleToHex((uint8_t)errorBytes[i] >> 4));
str.append(nibbleToHex((uint8_t)errorBytes[i]));
}
initMsg(pname);
u_wmsg(stderr, "problemCvtToU", str.getTerminatedBuffer(), u_wmsg_errorName(err));
u_wmsg(stderr, "problemCvtToU",
UnicodeString(pos, length, "").getTerminatedBuffer(),
str.getTerminatedBuffer(),
u_wmsg_errorName(err));
willexit = TRUE;
err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
}
@ -744,7 +772,14 @@ ConvertFile::convertFile(const char *pname,
if (sig < 0) {
if (u.charAt(0) == 0xfeff) {
u.remove(0, 1);
// account for the removed UChar and offset
--ulen;
// remove an offset from fromoffsets[] as well
// to keep the array parallel with the UChars
memmove(fromoffsets, fromoffsets + 1, ulen * 4);
}
sig = 0;
}
@ -770,6 +805,13 @@ ConvertFile::convertFile(const char *pname,
if (sig > 0) {
if (u.charAt(0) != 0xfeff && cnvSigType(convto) == CNV_WITH_FEFF) {
u.insert(0, (UChar)0xfeff);
// insert a pseudo-offset into fromoffsets[] as well
// to keep the array parallel with the UChars
memmove(fromoffsets + 1, fromoffsets, ulen * 4);
fromoffsets[0] = -1;
// account for the additional UChar and offset
++ulen;
}
sig = 0;
@ -792,7 +834,7 @@ ConvertFile::convertFile(const char *pname,
ucnv_fromUnicode(convto, &bufp, buf + bufsz,
&unibufbp,
unibuf + ulen,
tooffsets, (UBool)(flush && fromSawEndOfBytes), &err);
NULL, (UBool)(flush && fromSawEndOfBytes), &err);
// toSawEndOfUnicode indicates that ucnv_fromUnicode() is done
// converting all of the intermediate UChars.
@ -802,27 +844,82 @@ ConvertFile::convertFile(const char *pname,
if (err == U_BUFFER_OVERFLOW_ERROR) {
err = U_ZERO_ERROR;
} else if (U_FAILURE(err)) {
UChar errorUChars[4];
const char *errtag;
char pos[32];
UChar32 c;
int8_t i, length, errorLength;
// TODO Do not use fromoffsets if (t != NULL) because the Unicode text may
UErrorCode localError = U_ZERO_ERROR;
errorLength = (int8_t)LENGTHOF(errorUChars);
ucnv_getInvalidUChars(convto, errorUChars, &errorLength, &localError);
if (U_FAILURE(localError) || errorLength == 0) {
// need at least 1 so that we don't access beyond the length of fromoffsets[]
errorLength = 1;
}
int32_t ferroffset;
if (t == NULL) {
// Unicode buffer offset of the start of the error UChars
ferroffset = (int32_t)((unibufbp - unibuf) - errorLength);
if (ferroffset < 0) {
// approximation - the character started in the previous Unicode buffer
ferroffset = 0;
}
// get the corresponding byte offset out of fromoffsets[]
// go back if the offset is not known for some of the UChars
int32_t fromoffset;
do {
fromoffset = fromoffsets[ferroffset];
} while (fromoffset < 0 && --ferroffset >= 0);
// total input file offset =
// input file offset of the current byte buffer +
// byte buffer offset of where the current Unicode buffer is converted from +
// fromoffsets[Unicode offset]
ferroffset = infoffset + (prevbufp - buf) + fromoffset;
errtag = "problemCvtFromU";
} else {
// Do not use fromoffsets if (t != NULL) because the Unicode text may
// be different from what the offsets refer to.
uint32_t erroffset =
dataOffset((int32_t)(bufp - buf - 1), fromoffsets, (int32_t)(bufsz), tooffsets, (int32_t)(bufsz));
int32_t ferroffset = (int32_t)(infoffset - ulen + erroffset);
if ((int32_t) ferroffset < 0) {
// output file offset
ferroffset = (int32_t)(outfoffset + (bufp - buf));
errtag = "problemCvtFromUOut";
} else {
errtag = "problemCvtFromU";
}
sprintf(pos, "%u", ferroffset);
UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1));
length = (int8_t)sprintf(pos, "%u", ferroffset);
// output the code points that caused the error
UnicodeString str;
for (i = 0; i < errorLength;) {
if (i > 0) {
str.append((UChar)0x20);
}
U16_NEXT(errorUChars, i, errorLength, c);
if (c >= 0x100000) {
str.append(nibbleToHex((uint8_t)(c >> 20)));
}
if (c >= 0x10000) {
str.append(nibbleToHex((uint8_t)(c >> 16)));
}
str.append(nibbleToHex((uint8_t)(c >> 12)));
str.append(nibbleToHex((uint8_t)(c >> 8)));
str.append(nibbleToHex((uint8_t)(c >> 4)));
str.append(nibbleToHex((uint8_t)c));
}
initMsg(pname);
u_wmsg(stderr, errtag, str.getTerminatedBuffer(),
u_wmsg(stderr, errtag,
UnicodeString(pos, length, "").getTerminatedBuffer(),
str.getTerminatedBuffer(),
u_wmsg_errorName(err));
u_wmsg(stderr, "errorUnicode", str.getTerminatedBuffer());
willexit = TRUE;
err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
}
// Replaced a check for whether the intermediate Unicode characters were all consumed by
@ -832,7 +929,7 @@ ConvertFile::convertFile(const char *pname,
size_t outlen = (size_t) (bufp - buf);
outfoffset += (int32_t)(wr = fwrite(buf, 1, outlen, outfile));
if (wr != outlen) {
UnicodeString str(strerror(errno), "");
UnicodeString str(strerror(errno));
initMsg(pname);
u_wmsg(stderr, "cantWrite", str.getTerminatedBuffer());
willexit = TRUE;