ICU-2983 fix offset handling and error output

X-SVN-Rev: 14206
2003-12-21 21:49:37 +00:00 · 2003-12-21 21:49:37 +00:00 · 1b6f79bd4e
commit 1b6f79bd4e
parent a34698b244
3 changed files with 169 additions and 62 deletions
--- a/icu4c/source/extra/uconv/resources/fr.txt
+++ b/icu4c/source/extra/uconv/resources/fr.txt
@ -1,6 +1,6 @@
 // -*- Coding: utf-8; -*-  [all uconv resource files]
-// Copyright (c) 2000 IBM, Inc. and Others.
-// $Revision: 1.2 $
+// Copyright (c) 2000-2003 IBM, Inc. and Others.
+// $Revision: 1.3 $
 //
 // Root translation file for uconv messages.
 // So you want to translate this file??? Great!
@ -28,8 +28,21 @@ fr

    lcUsageWord { "usage" } 
    ucUsageWord { "Usage" } 
-    usage { "{0} : {1} [ -h, -?, --help ] [ -V, --version ] [ -s, --silent ] [ -v, --verbose ] [ -l, --list | --list-code code | --default-code | -L, --list-transliterators ] [ --canon ] [ -x translitération ] [ --to-callback callback | -c ] [ --from-callback callback | -i ] [ --callback callback ] [ --fallback | --no-fallback ] [ -b, --block-size taille ] [ -f, --from-code code ] [ -t, --to-code code ] [ -o, --output fichier ] [ fichier ... ]\n" } 
+    usage {
+        "{0}: {1} "
+        "[ -h, -?, --help ] [ -V, --version ] [ -s, --silent ] [ -v, --verbose ] "
+        "[ -l, --list | --list-code code | --default-code | -L, --list-transliterators ] "
+        "[ --canon ] [ -x translitération ] "
+        "[ --to-callback callback | -c ] [ --from-callback callback | -i ] [ --callback callback ] "
+        "[ --fallback | --no-fallback ] "
+        "[ -b, --block-size taille ] "
+        "[ -f, --from-code code ] [ -t, --to-code code ] "
+        "[ --add-signature ] [ --remove-signature ] "
+        "[ -o, --output fichier ] "
+        "[ fichier ... ]\n"
+    }

+    // TODO there is some English in here
    help { "Options :  -h, --help                   affiche ce message\n"
           "           -V, --version                affiche la version du programme\n"
 "           -s, --silent                 supprime les messages\n"
@ -50,6 +63,8 @@ fr
 "           --no-fallback                n''utilise pas les correspondances de secours\n"
 "           -f, --from-code code         fixe l''encodage d''origine\n"
 "           -t, --to-code code           fixe l''encodage de destination\n"
+"          --add-signature               add a U+FEFF Unicode signature character (BOM)\n"
+"          --remove-signature            remove a U+FEFF Unicode signature character (BOM)\n"
 "           -o, --output fichier         écrit la sortie dans fichier\n"
 "\n"
 "Callbacks :" } 
@ -82,10 +97,8 @@ fr
    cantWrite { "Le texte converti ne peut pas être écrit : {0}.\n" } // 0: OS error string
    cantRead { "Erreur de lecture du fichier d''entrée : {0}.\n" } // 0: OS error string

-    premEnd { "Fin prématurée de la conversion d''Unicode vers l''encodage de destination à la position {0}.\n" } // 0: position
-    premEndInput { "Fin prématurée de l''entrée durant la conversion de l''encodage original vers Unicode à la position {0}.\n" } // 0: position
-
-    problemCvtToU { "La conversion d''Unicode vers l''encodage de destination a échoué à la position {0} : {1}.\n" } // 0: position, 1: err
-    problemCvtFromU { "La conversion de l''encodage original vers Unicode a échoué à la position {0} : {1}.\n" } // 0: position, 1: err
-    problemCvtFromUOut { "La conversion de l''encodage original vers Unicode a échoué à la position {0} de la sortie : {1}.\n" } // 0: position, 1: err
+    // TODO retranslate the problemCvt... messages because their format changed
+    //problemCvtToU { "La conversion d''Unicode vers l''encodage de destination a échoué à la position {0} : {1}.\n" } // 0: position, 1: err
+    //problemCvtFromU { "La conversion de l''encodage original vers Unicode a échoué à la position {0} : {1}.\n" } // 0: position, 1: err
+    //problemCvtFromUOut { "La conversion de l''encodage original vers Unicode a échoué à la position {0} de la sortie : {1}.\n" } // 0: position, 1: err
 }
--- a/icu4c/source/extra/uconv/resources/root.txt
+++ b/icu4c/source/extra/uconv/resources/root.txt
@ -1,6 +1,6 @@
 // -*- Coding: utf-8; -*-  [all uconv resource files]
 // Copyright (c) 2000-2003 IBM, Inc. and Others.
-// $Revision: 1.3 $
+// $Revision: 1.4 $
 //
 // Root translation file for uconv messages.
 // So you want to translate this file??? Great!
@ -96,12 +96,9 @@ root
  cantWrite       { "The converted text couldn't be written: {0}.\n" } // 0: OS error string
  cantRead        { "Error reading from input file: {0}.\n" } // 0: OS error string

-  premEnd         { "Premature end of Unicode to destination encoding conversion at position {0}.\n" } // 0: position
-  premEndInput    { "Premature end of input when converting from original encoding to Unicode at position {0}.\n" } // 0: position
-
-  problemCvtToU   { "Conversion to Unicode from codepage failed at position {0}: {1}.\n" } // 0: position, 1: err
-  problemCvtFromU { "Conversion from Unicode to codepage failed at position {0}: {1}.\n"} // 0: position, 1: err
-  problemCvtFromUOut { "Conversion from Unicode to codepage failed at position {0} in output: {1}.\n"} // 0: position, 1: err
+  problemCvtToU   { "Conversion to Unicode from codepage failed at input byte position {0}. Bytes: {1} Error: {2}\n" } // 0: position, 1: bytes, 2: err
+  problemCvtFromU { "Conversion from Unicode to codepage failed at input byte position {0}. Unicode: {1} Error: {2}\n"} // 0: position, 1: Unicode, 2: err
+  problemCvtFromUOut { "Conversion from Unicode to codepage failed at output byte position {0}. Unicode: {1} Error: {2}\n"} // 0: position, 1: Unicode, 2: err

 // ICU errors - used by u_wmsg_errorName()

--- a/icu4c/source/extra/uconv/uconv.cpp
+++ b/icu4c/source/extra/uconv/uconv.cpp
@ -50,6 +50,8 @@
 U_CFUNC char uconvmsg_dat[];
 #endif

+#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
+
 #define DEFAULT_BUFSZ   4096
 #define UCONVMSG "uconvmsg"

@ -270,7 +272,7 @@ static int printConverters(const char *pname, const char *lookfor,
        if (U_FAILURE(err)) {
            printf("%s", name);

-            UnicodeString str(name, (int32_t)(uprv_strlen(name) + 1));
+            UnicodeString str(name, "");
            putchar('\t');
            u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
                u_wmsg_errorName(err));
@ -284,7 +286,7 @@ static int printConverters(const char *pname, const char *lookfor,
                const char *alias = ucnv_getAlias(name, a, &err);

                if (U_FAILURE(err)) {
-                    UnicodeString str(name, (int32_t)(uprv_strlen(name) + 1));
+                    UnicodeString str(name, "");
                    putchar('\t');
                    u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
                        u_wmsg_errorName(err));
@ -419,6 +421,15 @@ enum {
    CNV_ADDS_FEFF   // automatically adds/detects the U+FEFF signature character
 };

+inline UChar
+nibbleToHex(uint8_t n) {
+    n &= 0xf;
+    return
+        n <= 9 ?
+            (UChar)(0x30 + n) :
+            (UChar)((0x61 - 10) + n);
+}
+
 // check the converter's Unicode signature properties;
 // the fromUnicode side of the converter must be in its initial state
 // and will be reset again if it was used
@ -465,26 +476,10 @@ cnvSigType(UConverter *cnv) {
    return result;
 }

-/* Return the offset of a byte in its source, given the from and to offsets
-   vectors and the byte offset itself. */
-
-static inline int32_t dataOffset(int32_t whereto,
-    const int32_t *fromoffsets, int32_t fromsz,
-    const int32_t *tooffsets, int32_t tosz) {
-    if (whereto >= 0 && whereto < tosz) {
-        whereto = tooffsets[whereto];
-        if (whereto >= 0 && whereto < fromsz) {
-            return fromoffsets[whereto];
-        }
-    }
-
-    return 0;
-}
-
 class ConvertFile {
 public:
    ConvertFile() :
-        buf(NULL), fromoffsets(NULL), tooffsets(NULL),
+        buf(NULL), fromoffsets(NULL),
        bufsz(0), signature(0) {}

    void
@ -493,9 +488,8 @@ public:

        buf = new char[bufsz];

-        // allocate one memory block for both offsets arrays
-        fromoffsets = new int32_t[2 * bufsz];
-        tooffsets = fromoffsets + bufsz;
+        // +1 for an added U+FEFF in the intermediate Unicode buffer
+        fromoffsets = new int32_t[bufsz + 1];
    }

    ~ConvertFile() {
@ -518,7 +512,7 @@ private:
    friend extern int main(int argc, char **argv);

    char *buf;
-    int32_t *fromoffsets, *tooffsets;
+    int32_t *fromoffsets;

    size_t bufsz;
    int8_t signature; // add (1) or remove (-1) a U+FEFF Unicode signature character
@ -544,7 +538,7 @@ ConvertFile::convertFile(const char *pname,
    UConverter *convto = 0;
    UErrorCode err = U_ZERO_ERROR;
    UBool flush;
-    const char *cbufp;
+    const char *cbufp, *prevbufp;
    char *bufp;

    uint32_t infoffset = 0, outfoffset = 0;   /* Where we are in the file, for error reporting. */
@ -636,7 +630,7 @@ ConvertFile::convertFile(const char *pname,

    convfrom = ucnv_open(fromcpage, &err);
    if (U_FAILURE(err)) {
-        UnicodeString str(fromcpage, (int32_t)(uprv_strlen(fromcpage) + 1));
+        UnicodeString str(fromcpage, "");
        initMsg(pname);
        u_wmsg(stderr, "cantOpenFromCodeset", str.getTerminatedBuffer(),
            u_wmsg_errorName(err));
@ -651,7 +645,7 @@ ConvertFile::convertFile(const char *pname,

    convto = ucnv_open(tocpage, &err);
    if (U_FAILURE(err)) {
-        UnicodeString str(tocpage, (int32_t)(uprv_strlen(tocpage) + 1));
+        UnicodeString str(tocpage, "");
        initMsg(pname);
        u_wmsg(stderr, "cantOpenToCodeset", str.getTerminatedBuffer(),
            u_wmsg_errorName(err));
@ -670,10 +664,14 @@ ConvertFile::convertFile(const char *pname,

    // OK, we can convert now.
    sig = signature;
+    rd = 0;

    do {
        willexit = FALSE;

+        // input file offset at the beginning of the next buffer
+        infoffset += rd;
+
        rd = fread(buf, 1, bufsz, infile);
        if (ferror(infile) != 0) {
            UnicodeString str(strerror(errno));
@ -698,18 +696,19 @@ ConvertFile::convertFile(const char *pname,

        // convert until the input is consumed
        do {
+            // remember the start of the current byte-to-Unicode conversion
+            prevbufp = cbufp;
+
            unibuf = unibufp = u.getBuffer((int32_t)bufsz);

            // Use bufsz instead of u.getCapacity() for the targetLimit
            // so that we don't overflow fromoffsets[].
            ucnv_toUnicode(convfrom, &unibufp, unibuf + bufsz, &cbufp,
-                cbufp + rd, fromoffsets, flush, &err);
+                buf + rd, fromoffsets, flush, &err);

            ulen = (int32_t)(unibufp - unibuf);
            u.releaseBuffer(ulen);

-            infoffset += (uint32_t)(cbufp - buf);
-
            // fromSawEndOfBytes indicates that ucnv_toUnicode() is done
            // converting all of the input bytes.
            // It works like this because ucnv_toUnicode() returns only under the
@ -724,11 +723,40 @@ ConvertFile::convertFile(const char *pname,
            if (err == U_BUFFER_OVERFLOW_ERROR) {
                err = U_ZERO_ERROR;
            } else if (U_FAILURE(err)) {
-                char pos[32];
-                sprintf(pos, "%u", infoffset - 1);
-                UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1));
+                char pos[32], errorBytes[32];
+                int8_t i, length, errorLength;
+
+                UErrorCode localError = U_ZERO_ERROR;
+                errorLength = (int8_t)sizeof(errorBytes);
+                ucnv_getInvalidChars(convfrom, errorBytes, &errorLength, &localError);
+                if (U_FAILURE(localError) || errorLength == 0) {
+                    errorLength = 1;
+                }
+
+                // print the input file offset of the start of the error bytes:
+                // input file offset of the current byte buffer +
+                // length of the just consumed bytes -
+                // length of the error bytes
+                length =
+                    (int8_t)sprintf(pos, "%d",
+                        (int)(infoffset + (cbufp - buf) - errorLength));
+
+                // output the bytes that caused the error
+                UnicodeString str;
+                for (i = 0; i < errorLength; ++i) {
+                    if (i > 0) {
+                        str.append((UChar)0x20);
+                    }
+                    str.append(nibbleToHex((uint8_t)errorBytes[i] >> 4));
+                    str.append(nibbleToHex((uint8_t)errorBytes[i]));
+                }
+
                initMsg(pname);
-                u_wmsg(stderr, "problemCvtToU", str.getTerminatedBuffer(), u_wmsg_errorName(err));
+                u_wmsg(stderr, "problemCvtToU",
+                        UnicodeString(pos, length, "").getTerminatedBuffer(),
+                        str.getTerminatedBuffer(),
+                        u_wmsg_errorName(err));
+
                willexit = TRUE;
                err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
            }
@ -744,7 +772,14 @@ ConvertFile::convertFile(const char *pname,
            if (sig < 0) {
                if (u.charAt(0) == 0xfeff) {
                    u.remove(0, 1);
+
+                    // account for the removed UChar and offset
                    --ulen;
+
+                    // remove an offset from fromoffsets[] as well
+                    // to keep the array parallel with the UChars
+                    memmove(fromoffsets, fromoffsets + 1, ulen * 4);
+
                }
                sig = 0;
            }
@ -770,6 +805,13 @@ ConvertFile::convertFile(const char *pname,
            if (sig > 0) {
                if (u.charAt(0) != 0xfeff && cnvSigType(convto) == CNV_WITH_FEFF) {
                    u.insert(0, (UChar)0xfeff);
+
+                    // insert a pseudo-offset into fromoffsets[] as well
+                    // to keep the array parallel with the UChars
+                    memmove(fromoffsets + 1, fromoffsets, ulen * 4);
+                    fromoffsets[0] = -1;
+
+                    // account for the additional UChar and offset
                    ++ulen;
                }
                sig = 0;
@ -792,7 +834,7 @@ ConvertFile::convertFile(const char *pname,
                ucnv_fromUnicode(convto, &bufp, buf + bufsz,
                                 &unibufbp,
                                 unibuf + ulen,
-                                 tooffsets, (UBool)(flush && fromSawEndOfBytes), &err);
+                                 NULL, (UBool)(flush && fromSawEndOfBytes), &err);

                // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done
                // converting all of the intermediate UChars.
@ -802,27 +844,82 @@ ConvertFile::convertFile(const char *pname,
                if (err == U_BUFFER_OVERFLOW_ERROR) {
                    err = U_ZERO_ERROR;
                } else if (U_FAILURE(err)) {
+                    UChar errorUChars[4];
                    const char *errtag;
                    char pos[32];
+                    UChar32 c;
+                    int8_t i, length, errorLength;

-                    // TODO Do not use fromoffsets if (t != NULL) because the Unicode text may
-                    // be different from what the offsets refer to.
-                    uint32_t erroffset =
-                        dataOffset((int32_t)(bufp - buf - 1), fromoffsets, (int32_t)(bufsz), tooffsets, (int32_t)(bufsz));
-                    int32_t ferroffset = (int32_t)(infoffset - ulen + erroffset);
+                    UErrorCode localError = U_ZERO_ERROR;
+                    errorLength = (int8_t)LENGTHOF(errorUChars);
+                    ucnv_getInvalidUChars(convto, errorUChars, &errorLength, &localError);
+                    if (U_FAILURE(localError) || errorLength == 0) {
+                        // need at least 1 so that we don't access beyond the length of fromoffsets[]
+                        errorLength = 1;
+                    }

-                    if ((int32_t) ferroffset < 0) {
+                    int32_t ferroffset;
+
+                    if (t == NULL) {
+                        // Unicode buffer offset of the start of the error UChars
+                        ferroffset = (int32_t)((unibufbp - unibuf) - errorLength);
+                        if (ferroffset < 0) {
+                            // approximation - the character started in the previous Unicode buffer
+                            ferroffset = 0;
+                        }
+
+                        // get the corresponding byte offset out of fromoffsets[]
+                        // go back if the offset is not known for some of the UChars
+                        int32_t fromoffset;
+                        do {
+                            fromoffset = fromoffsets[ferroffset];
+                        } while (fromoffset < 0 && --ferroffset >= 0);
+
+                        // total input file offset =
+                        // input file offset of the current byte buffer +
+                        // byte buffer offset of where the current Unicode buffer is converted from +
+                        // fromoffsets[Unicode offset]
+                        ferroffset = infoffset + (prevbufp - buf) + fromoffset;
+                        errtag = "problemCvtFromU";
+                    } else {
+                        // Do not use fromoffsets if (t != NULL) because the Unicode text may
+                        // be different from what the offsets refer to.
+
+                        // output file offset
                        ferroffset = (int32_t)(outfoffset + (bufp - buf));
                        errtag = "problemCvtFromUOut";
-                    } else {
-                        errtag = "problemCvtFromU";
                    }
-                    sprintf(pos, "%u", ferroffset);
-                    UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1));
+
+                    length = (int8_t)sprintf(pos, "%u", ferroffset);
+
+                    // output the code points that caused the error
+                    UnicodeString str;
+                    for (i = 0; i < errorLength;) {
+                        if (i > 0) {
+                            str.append((UChar)0x20);
+                        }
+                        U16_NEXT(errorUChars, i, errorLength, c);
+                        if (c >= 0x100000) {
+                            str.append(nibbleToHex((uint8_t)(c >> 20)));
+                        }
+                        if (c >= 0x10000) {
+                            str.append(nibbleToHex((uint8_t)(c >> 16)));
+                        }
+                        str.append(nibbleToHex((uint8_t)(c >> 12)));
+                        str.append(nibbleToHex((uint8_t)(c >> 8)));
+                        str.append(nibbleToHex((uint8_t)(c >> 4)));
+                        str.append(nibbleToHex((uint8_t)c));
+                    }
+
                    initMsg(pname);
-                    u_wmsg(stderr, errtag, str.getTerminatedBuffer(),
+                    u_wmsg(stderr, errtag,
+                            UnicodeString(pos, length, "").getTerminatedBuffer(),
+                            str.getTerminatedBuffer(),
                           u_wmsg_errorName(err));
+                    u_wmsg(stderr, "errorUnicode", str.getTerminatedBuffer());
+
                    willexit = TRUE;
+                    err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
                }

                // Replaced a check for whether the intermediate Unicode characters were all consumed by
@ -832,7 +929,7 @@ ConvertFile::convertFile(const char *pname,
                size_t outlen = (size_t) (bufp - buf);
                outfoffset += (int32_t)(wr = fwrite(buf, 1, outlen, outfile));
                if (wr != outlen) {
-                    UnicodeString str(strerror(errno), "");
+                    UnicodeString str(strerror(errno));
                    initMsg(pname);
                    u_wmsg(stderr, "cantWrite", str.getTerminatedBuffer());
                    willexit = TRUE;