ICU-1220 Added escape-codepoint as a callback to let people use the

new UCNV_ESCAPE_CODEPOINT. Options compatibility with iconv(1): added -V, --version; -s, --silent; --verbose (and -v because of ICU commands); -o, --output file. uconv(1) is now mostly compatible with iconv(1) usage, except for the fact that one cannot convert many files in one run yet. X-SVN-Rev: 7414
2002-01-09 01:04:32 +00:00 · 2002-01-09 01:04:32 +00:00 · 3b470abbe3
commit 3b470abbe3
parent 85a889c4c2
3 changed files with 152 additions and 26 deletions
--- a/icu4c/source/extra/uconv/root.txt
+++ b/icu4c/source/extra/uconv/root.txt
@ -1,6 +1,6 @@
 // -*- Coding: utf-8; -*-  [all uconv resource files]
 // Copyright (c) 2000 IBM, Inc. and Others.
-// $Revision: 1.16 $
+// $Revision: 1.17 $
 //
 // Root translation file for uconv messages.
 // So you want to translate this file??? Great!
@ -30,9 +30,12 @@ root

  lcUsageWord { "usage" }
  ucUsageWord { "Usage" }
-  usage { "{0}: {1} [ -h, -?, --help ] [ -l, --list | --list-code code | --default-code | -L, --list-transliterators ] [ --canon ] [ -x transliterator ] [ --to-callback callback | -c ] [ --from-callback callback | -i ] -f, --from-code code -t, --to-code code [ file ]\n" }
+  usage { "{0}: {1} [ -h, -?, --help ] [ -V, --version ] [ -s, --silent ] [ -v, --verbose ] [ -l, --list | --list-code code | --default-code | -L, --list-transliterators ] [ --canon ] [ -x transliterator ] [ --to-callback callback | -c ] [ --from-callback callback | -i ] [ --callback callback ] -f, --from-code code -t, --to-code code [ file ] [ -o, --output file ]\n" }

  help {  "Options:  -h, --help                    print this message\n"
+          "          -V, --version                 print the program version\n"
+          "          -s, --silent                  suppress messages\n"
+          "          -v, --verbose                 display progress information\n"
 	  "          -l, --list                    list all available encodings\n"
 	  "          --list-code code              list only the given encoding\n"
 	  "          --default-code                list only the default encoding\n"
@ -43,8 +46,10 @@ root
 	  "          -c                            omit invalid characters from the output\n"
 	  "          --from-callback callback      use callback on original encoding\n"
 	  "          -i                            ignore invalid sequences in the input\n"
+          "          --callback callback           use callback on both encodings\n"
 	  "          -f, --from-code code          set the original encoding\n"
 	  "          -t, --to-code code            set the destination encoding\n" 
+          "          -o, --output file             write output to file\n"
 	  "\n"
 	  "Callbacks:"
 }
@ -66,6 +71,7 @@ root
  unknownCallback { "Unknown callback: {0}\n" } // 0: callback name

  cantOpenInputF  { "Couldn''t open input file {0}: {1}.\n" } // 0: file, 1: strerror [OS error string]
+  cantCreateOutputF  { "Couldn''t create output file {0}: {1}.\n" } // 0: file, 1: strerror [OS error string]

  cantWrite       { "The converted text couldn't be written: {0}.\n" } // 0: OS error string
  cantRead        { "Error reading from input file {0}.\n" } // 0: OS error string
--- a/icu4c/source/extra/uconv/uconv.1.in
+++ b/icu4c/source/extra/uconv/uconv.1.in
@ -16,6 +16,15 @@
 .BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
 ]
 [
+.BI "\-V\fP, \fB\-\-version"
+]
+[
+.BI "\-s\fP, \fB\-\-silent"
+]
+[
+.BI "\-v\fP, \fB\-\-verbose"
+]
+[
 .BI "\-l\fP, \fB\-\-list"
 |
 .BI "\-l\fP, \fB\-\-list\-code" " code"
@ -40,14 +49,24 @@
 |
 .B "\-i"
 ]
+[
+.BI "\-\-callback" " callback"
+]
 .BI "\-f\fP, \fB\-\-from\-code" " encoding"
 .BI "\-t\fP, \fB\-\-to\-code" " encoding"
 [
-.I file
+.IR file 
+]
+[
+.BI "\-o\fP, \fB\-\-output" " file"
 ]
 .SH DESCRIPTION
 .B uconv
-converts its input from one given
+converts each given
+.I file
+(or its standard input if no
+.I file
+is specified) from one
 .I encoding
 to another. The transcoding is done using Unicode as a pivot encoding
 (e.g. the data are first transcoded from their original encoding to
@ -66,6 +85,12 @@ after the data have been transcoded to Unicode.
 .BR \-h\fP, \fB\-?\fP, \fB\-\-help
 Print help about usage and exit.
 .TP
+.BI "\-s\fP, \fB\-\-silent"
+Suppress messages during execution.
+.TP
+.BI "\-v\fP, \fB\-\-verbose"
+Display extra informative messages during execution.
+.TP
 .BI "\-l\fP, \fB\-\-list"
 List all the available encodings and exit.
 .TP
@ -125,6 +150,15 @@ Ignore invalid sequences in the input.
 Same as
 .BR "\-\-from\-callback skip" .
 .TP
+.BI "\-\-callback" " callback"
+Use
+.I callback
+to handle both characters that cannot be transcoded from the original
+encoding and characters that cannot be transcoded to the destination
+encoding. See section
+.B CALLBACKS
+for details on valid callbacks.
+.TP
 .BI "\-f\fP, \fB\-\-from\-code" " encoding"
 Set the original encoding of the data to 
 .IR encoding .
@ -132,6 +166,10 @@ Set the original encoding of the data to
 .BI "\-t\fP, \fB\-\-to\-code" " encoding"
 Transcode the data to
 .IR encoding .
+.TP
+.BI "\-o\fP, \fB\-\-output" " file"
+Write the transcoded data to
+.IR file .
 .SH CALLBACKS
 .B uconv
 supports specifying callbacks to handle invalid data. Callbacks can be
@ -150,19 +188,19 @@ callbacks actually supported by
 is displayed when it is called with
 .BR "\-h\fP, \fB\-\-help" .
 .PP
-.TP \w'\fBescape-xml-hex'u+3n
+.TP \w'\fBescape-codepoint'u+3n
 .B substitute
 Write the encoding's substitute sequence, or the Unicode
 replacement character
 .B U+FFFD
 when transcoding to Unicode.
-This is the default callback.
 .TP
 .B skip
 Ignore the invalid data.
 .TP
 .B stop
 Stop with an error when encountering invalid data.
+This is the default callback.
 .TP
 .B escape
 Same as
@ -210,6 +248,14 @@ Replace the missing characters with a string of the format
 where
 .I hhhh
 is the hexadecimal value of the character.
+.TP
+.B escape-codepoint
+Replace the missing characters with a string of the format
+.BR U+\fIhhhh\fP ,
+where
+.I hhhh
+is the hexadecimal value of the character. This is the format
+universally used to denote a Unicode codepoint in the literature.
 .SH VERSION
@VERSION@
 .SH COPYRIGHT
--- a/icu4c/source/extra/uconv/uconv.cpp
+++ b/icu4c/source/extra/uconv/uconv.cpp
@ -94,7 +94,8 @@ static struct callback_ent {
    { "escape-c", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C },
    { "escape-xml", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC },
    { "escape-xml-dec", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC },
-    { "escape-xml-hex", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX }
+    { "escape-xml-hex", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
+    { "escape-codepoint", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_CODEPOINT, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_CODEPOINT }
 };

 static const struct callback_ent *findCallback(const char *name) {
@ -324,6 +325,8 @@ static UBool convertFile(const char* fromcpage,
    const size_t readsize = buffsize-1;
    char* buff = 0;

+    uint32_t foffset = 0;        /* Where we are in the file, for error reporting. */
+
    UConverterFromUCallback oldfromucallback;
    UConverterToUCallback oldtoucallback;
    const void *oldcontext;
@ -351,7 +354,9 @@ static UBool convertFile(const char* fromcpage,
      }

    // Create codepage converter. If the codepage or its aliases weren't
-    // available, it returns NULL and a failure code
+    // available, it returns NULL and a failure code. We also set the
+    // callbacks, and return errors in the same way.
+
    convfrom = ucnv_open(fromcpage, &err);
    if (U_FAILURE(err))
    {
@ -385,10 +390,13 @@ static UBool convertFile(const char* fromcpage,
    // To ensure that the buffer always is of enough size, we
    // must take the worst case scenario, that is the character in the codepage
    // that uses the most bytes and multiply it against the buffsize
+
    totbuffsize = buffsize * ucnv_getMaxCharSize(convto);
    buff = new char[totbuffsize];
    unibuff = new UChar[buffsize];
-        
+
+    // OK, we can convert now.
+
    do  
    {
        rd = fread(buff, 1, readsize, infile);
@ -412,7 +420,9 @@ static UBool convertFile(const char* fromcpage,
        cbuffiter = buff;
        flush = rd!=readsize;
        ucnv_toUnicode(convfrom, &uniiter, uniiter + buffsize, &cbuffiter, cbuffiter + rd, 0, flush, &err);
-            
+          
+        foffset += uniiter - unibuff;
+
        if (U_FAILURE(err))
        {
            u_wmsg("problemCvtToU", u_wmsg_errorName(err));
@ -517,29 +527,35 @@ static void usage(const char *pname, int ecode)

 int main(int argc, char** argv)
 {
-    FILE* file = 0;
-    FILE* infile;
+    FILE* infile, *outfile;
    int   ret = 0;

    const char* fromcpage = 0;
    const char* tocpage = 0;
    const char *translit = 0;
    const char* infilestr = 0;
+    const char* outfilestr = 0;

-    UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_SUBSTITUTE;
+    UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP;
    const void *fromuctxt = 0;
-    UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_SUBSTITUTE;
+    UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP;
    const void *touctxt = 0;

    char** iter = argv+1;
    char** end = argv+argc;    

-    const char *pname = *argv;
+    const char *pname;

    int printConvs = 0, printCanon = 0;
    const char *printName = 0;
    int printTranslits = 0;

+    int silent = 0, verbose = 0;
+
+    // Prettify pname.
+    for (pname = *argv + strlen(*argv) - 1; pname != *argv && *pname != U_FILE_SEP_CHAR; --pname);
+    if (*pname == U_FILE_SEP_CHAR) ++pname;
+    
    // First, get the arguments from command-line
    // to know the codepages to convert between
    for (; iter!=end; iter++)
@ -651,7 +667,40 @@ int main(int argc, char** argv)
        else if (!strcmp("-i", *iter)) {
            toucallback = UCNV_TO_U_CALLBACK_SKIP;
        }
-        else if (**iter == '-' && (*iter)[1]) {
+        else if (!strcmp("--callback", *iter)) {
+            iter++;
+            if (iter!=end) {
+                const struct callback_ent *cbe = findCallback(*iter);
+                if (cbe) {
+                    fromucallback = cbe->fromu;
+                    fromuctxt = cbe->fromuctxt;
+                    toucallback = cbe->tou;
+                    touctxt = cbe->touctxt;
+                } else {
+                    UnicodeString str(*iter);
+                    initMsg(pname);
+                    u_wmsg("unknownCallback", str.getBuffer());
+                    return 4;
+                }
+            } else {
+                usage(pname, 1);
+            }
+        }
+        else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) {
+            silent = 1;
+        } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) {
+            verbose = 1;
+        } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) {
+            printf("%s v2.0\n", pname);
+            return 0;
+        } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) {
+            ++iter;
+            if (iter != end && !outfilestr) {
+                outfilestr = *iter;
+            } else {
+                usage(pname, 1);
+            }
+        } else if (**iter == '-' && (*iter)[1]) {
            usage(pname, 1);
        } else if (!infilestr) {
            infilestr = *iter;
@ -689,8 +738,8 @@ int main(int argc, char** argv)
    // Open the correct input file or connect to stdin for reading input
    if (infilestr!=0 && strcmp(infilestr, "-"))
    {
-        file = fopen(infilestr, "rb");
-        if (file==0)
+        infile = fopen(infilestr, "rb");
+        if (infile==0)
        {
          UnicodeString str1(infilestr,"");
          UnicodeString str2(strerror(errno),"");
@ -700,9 +749,9 @@ int main(int argc, char** argv)
                 str2.getBuffer());
          return 1;
        }
-        infile = file;
    }
    else {
+        infilestr = "-";
        infile = stdin;
 #ifdef WIN32
        if( setmode( fileno ( stdin ), O_BINARY ) == -1 ) {
@ -711,15 +760,38 @@ int main(int argc, char** argv)
        }
 #endif
    }
+
+    // Open the correct output file or connect to stdout for writing output
+    if (outfilestr!=0 && strcmp(outfilestr, "-"))
+    {
+        outfile = fopen(outfilestr, "wb");
+        if (outfile==0)
+        {
+          UnicodeString str1(outfilestr,"");
+          UnicodeString str2(strerror(errno),"");
+          initMsg(pname);
+          u_wmsg("cantCreateOutputF", 
+                 str1.getBuffer(),
+                 str2.getBuffer());
+          return 1;
+        }
+    } else {
+        outfilestr = "-";
+        outfile = stdout;
 #ifdef WIN32
-  if( setmode( fileno ( stdout ), O_BINARY ) == -1 ) {
-          perror ( "Cannot set stdout to binary mode" );
-          exit(-1);
-  }
+        if( setmode( fileno ( outfile ), O_BINARY ) == -1 ) {
+            perror ( "Cannot set output file to binary mode" );
+            exit(-1);
+        }
 #endif
+    }

  initMsg(pname);
-    if (!convertFile(fromcpage, toucallback, touctxt, tocpage, fromucallback, fromuctxt, translit, infile, stdout))
+  
+  if (verbose) {
+      fprintf(stderr, "%s:\n", infilestr);
+  }
+    if (!convertFile(fromcpage, toucallback, touctxt, tocpage, fromucallback, fromuctxt, translit, infile, outfile))
        goto error_exit;

    goto normal_exit;
@ -727,8 +799,10 @@ int main(int argc, char** argv)
    ret = 1;
  normal_exit:

-    if (file!=0)
-        fclose(file);
+    if (infile!=stdin)
+        fclose(infile);
+    if (outfile != stdout) fclose(outfile);
+
    return ret;
 }