diff --git a/icu4c/source/extra/uconv/root.txt b/icu4c/source/extra/uconv/root.txt index 6101d2e870..4493443018 100644 --- a/icu4c/source/extra/uconv/root.txt +++ b/icu4c/source/extra/uconv/root.txt @@ -1,6 +1,6 @@ // -*- Coding: utf-8; -*- [all uconv resource files] // Copyright (c) 2000 IBM, Inc. and Others. -// $Revision: 1.16 $ +// $Revision: 1.17 $ // // Root translation file for uconv messages. // So you want to translate this file??? Great! @@ -30,9 +30,12 @@ root lcUsageWord { "usage" } ucUsageWord { "Usage" } - usage { "{0}: {1} [ -h, -?, --help ] [ -l, --list | --list-code code | --default-code | -L, --list-transliterators ] [ --canon ] [ -x transliterator ] [ --to-callback callback | -c ] [ --from-callback callback | -i ] -f, --from-code code -t, --to-code code [ file ]\n" } + usage { "{0}: {1} [ -h, -?, --help ] [ -V, --version ] [ -s, --silent ] [ -v, --verbose ] [ -l, --list | --list-code code | --default-code | -L, --list-transliterators ] [ --canon ] [ -x transliterator ] [ --to-callback callback | -c ] [ --from-callback callback | -i ] [ --callback callback ] -f, --from-code code -t, --to-code code [ file ] [ -o, --output file ]\n" } help { "Options: -h, --help print this message\n" + " -V, --version print the program version\n" + " -s, --silent suppress messages\n" + " -v, --verbose display progress information\n" " -l, --list list all available encodings\n" " --list-code code list only the given encoding\n" " --default-code list only the default encoding\n" @@ -43,8 +46,10 @@ root " -c omit invalid characters from the output\n" " --from-callback callback use callback on original encoding\n" " -i ignore invalid sequences in the input\n" + " --callback callback use callback on both encodings\n" " -f, --from-code code set the original encoding\n" " -t, --to-code code set the destination encoding\n" + " -o, --output file write output to file\n" "\n" "Callbacks:" } @@ -66,6 +71,7 @@ root unknownCallback { "Unknown callback: {0}\n" } // 0: callback 
name cantOpenInputF { "Couldn''t open input file {0}: {1}.\n" } // 0: file, 1: strerror [OS error string] + cantCreateOutputF { "Couldn''t create output file {0}: {1}.\n" } // 0: file, 1: strerror [OS error string] cantWrite { "The converted text couldn't be written: {0}.\n" } // 0: OS error string cantRead { "Error reading from input file {0}.\n" } // 0: OS error string diff --git a/icu4c/source/extra/uconv/uconv.1.in b/icu4c/source/extra/uconv/uconv.1.in index 25e3287a2e..871de61dd7 100644 --- a/icu4c/source/extra/uconv/uconv.1.in +++ b/icu4c/source/extra/uconv/uconv.1.in @@ -16,6 +16,15 @@ .BR "\-h\fP, \fB\-?\fP, \fB\-\-help" ] [ +.BI "\-V\fP, \fB\-\-version" +] +[ +.BI "\-s\fP, \fB\-\-silent" +] +[ +.BI "\-v\fP, \fB\-\-verbose" +] +[ .BI "\-l\fP, \fB\-\-list" | .BI "\-l\fP, \fB\-\-list\-code" " code" @@ -40,14 +49,24 @@ | .B "\-i" ] +[ +.BI "\-\-callback" " callback" +] .BI "\-f\fP, \fB\-\-from\-code" " encoding" .BI "\-t\fP, \fB\-\-to\-code" " encoding" [ -.I file +.IR file +] +[ +.BI "\-o\fP, \fB\-\-output" " file" ] .SH DESCRIPTION .B uconv -converts its input from one given +converts each given +.I file +(or its standard input if no +.I file +is specified) from one .I encoding to another. The transcoding is done using Unicode as a pivot encoding (e.g. the data are first transcoded from their original encoding to @@ -66,6 +85,12 @@ after the data have been transcoded to Unicode. .BR \-h\fP, \fB\-?\fP, \fB\-\-help Print help about usage and exit. .TP +.BI "\-s\fP, \fB\-\-silent" +Suppress messages during execution. +.TP +.BI "\-v\fP, \fB\-\-verbose" +Display extra informative messages during execution. +.TP .BI "\-l\fP, \fB\-\-list" List all the available encodings and exit. .TP @@ -125,6 +150,15 @@ Ignore invalid sequences in the input. Same as .BR "\-\-from\-callback skip" . 
.TP +.BI "\-\-callback" " callback" +Use +.I callback +to handle both characters that cannot be transcoded from the original +encoding and characters that cannot be transcoded to the destination +encoding. See section +.B CALLBACKS +for details on valid callbacks. +.TP .BI "\-f\fP, \fB\-\-from\-code" " encoding" Set the original encoding of the data to .IR encoding . @@ -132,6 +166,10 @@ Set the original encoding of the data to .BI "\-t\fP, \fB\-\-to\-code" " encoding" Transcode the data to .IR encoding . +.TP +.BI "\-o\fP, \fB\-\-output" " file" +Write the transcoded data to +.IR file . .SH CALLBACKS .B uconv supports specifying callbacks to handle invalid data. Callbacks can be @@ -150,19 +188,19 @@ callbacks actually supported by is displayed when it is called with .BR "\-h\fP, \fB\-\-help" . .PP -.TP \w'\fBescape-xml-hex'u+3n +.TP \w'\fBescape-codepoint'u+3n .B substitute Write the the encoding's substitute sequence, or the Unicode replacement character .B U+FFFD when transcoding to Unicode. -This is the default callback. .TP .B skip Ignore the invalid data. .TP .B stop Stop with an error when encountering invalid data. +This is the default callback. .TP .B escape Same as @@ -210,6 +248,14 @@ Replace the missing characters with a string of the format where .I hhhh is the hexadecimal value of the character. +.TP +.B escape-codepoint +Replace the missing characters with a string of the format +.BR U+\fIhhhh\fP , +where +.I hhhh +is the hexadecimal value of the character. This is the format +universally used to denote a Unicode codepoint in the literature. 
.SH VERSION @VERSION@ .SH COPYRIGHT diff --git a/icu4c/source/extra/uconv/uconv.cpp b/icu4c/source/extra/uconv/uconv.cpp index 5e799d6939..f5ed6b619e 100644 --- a/icu4c/source/extra/uconv/uconv.cpp +++ b/icu4c/source/extra/uconv/uconv.cpp @@ -94,7 +94,8 @@ static struct callback_ent { { "escape-c", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C }, { "escape-xml", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC }, { "escape-xml-dec", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC }, - { "escape-xml-hex", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX } + { "escape-xml-hex", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX }, + { "escape-codepoint", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_CODEPOINT, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_CODEPOINT } }; static const struct callback_ent *findCallback(const char *name) { @@ -324,6 +325,8 @@ static UBool convertFile(const char* fromcpage, const size_t readsize = buffsize-1; char* buff = 0; + uint32_t foffset = 0; /* Where we are in the file, for error reporting. */ + UConverterFromUCallback oldfromucallback; UConverterToUCallback oldtoucallback; const void *oldcontext; @@ -351,7 +354,9 @@ static UBool convertFile(const char* fromcpage, } // Create codepage converter. If the codepage or its aliases weren't - // available, it returns NULL and a failure code + // available, it returns NULL and a failure code. We also set the + // callbacks, and return errors in the same way. 
+ convfrom = ucnv_open(fromcpage, &err); if (U_FAILURE(err)) { @@ -385,10 +390,13 @@ static UBool convertFile(const char* fromcpage, // To ensure that the buffer always is of enough size, we // must take the worst case scenario, that is the character in the codepage // that uses the most bytes and multiply it against the buffsize + totbuffsize = buffsize * ucnv_getMaxCharSize(convto); buff = new char[totbuffsize]; unibuff = new UChar[buffsize]; - + + // OK, we can convert now. + do { rd = fread(buff, 1, readsize, infile); @@ -412,7 +420,9 @@ static UBool convertFile(const char* fromcpage, cbuffiter = buff; flush = rd!=readsize; ucnv_toUnicode(convfrom, &uniiter, uniiter + buffsize, &cbuffiter, cbuffiter + rd, 0, flush, &err); - + + foffset += uniiter - unibuff; + if (U_FAILURE(err)) { u_wmsg("problemCvtToU", u_wmsg_errorName(err)); @@ -517,29 +527,35 @@ static void usage(const char *pname, int ecode) int main(int argc, char** argv) { - FILE* file = 0; - FILE* infile; + FILE* infile, *outfile; int ret = 0; const char* fromcpage = 0; const char* tocpage = 0; const char *translit = 0; const char* infilestr = 0; + const char* outfilestr = 0; - UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_SUBSTITUTE; + UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP; const void *fromuctxt = 0; - UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_SUBSTITUTE; + UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP; const void *touctxt = 0; char** iter = argv+1; char** end = argv+argc; - const char *pname = *argv; + const char *pname; int printConvs = 0, printCanon = 0; const char *printName = 0; int printTranslits = 0; + int silent = 0, verbose = 0; + + // Prettify pname. 
+ for (pname = *argv + strlen(*argv) - 1; pname != *argv && *pname != U_FILE_SEP_CHAR; --pname); + if (*pname == U_FILE_SEP_CHAR) ++pname; + // First, get the arguments from command-line // to know the codepages to convert between for (; iter!=end; iter++) @@ -651,7 +667,40 @@ int main(int argc, char** argv) else if (!strcmp("-i", *iter)) { toucallback = UCNV_TO_U_CALLBACK_SKIP; } - else if (**iter == '-' && (*iter)[1]) { + else if (!strcmp("--callback", *iter)) { + iter++; + if (iter!=end) { + const struct callback_ent *cbe = findCallback(*iter); + if (cbe) { + fromucallback = cbe->fromu; + fromuctxt = cbe->fromuctxt; + toucallback = cbe->tou; + touctxt = cbe->touctxt; + } else { + UnicodeString str(*iter); + initMsg(pname); + u_wmsg("unknownCallback", str.getBuffer()); + return 4; + } + } else { + usage(pname, 1); + } + } + else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) { + silent = 1; + } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) { + verbose = 1; + } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) { + printf("%s v2.0\n", pname); + return 0; + } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) { + ++iter; + if (iter != end && !outfilestr) { + outfilestr = *iter; + } else { + usage(pname, 1); + } + } else if (**iter == '-' && (*iter)[1]) { usage(pname, 1); } else if (!infilestr) { infilestr = *iter; @@ -689,8 +738,8 @@ int main(int argc, char** argv) // Open the correct input file or connect to stdin for reading input if (infilestr!=0 && strcmp(infilestr, "-")) { - file = fopen(infilestr, "rb"); - if (file==0) + infile = fopen(infilestr, "rb"); + if (infile==0) { UnicodeString str1(infilestr,""); UnicodeString str2(strerror(errno),""); @@ -700,9 +749,9 @@ int main(int argc, char** argv) str2.getBuffer()); return 1; } - infile = file; } else { + infilestr = "-"; infile = stdin; #ifdef WIN32 if( setmode( fileno ( stdin ), O_BINARY ) == -1 ) { @@ -711,15 +760,38 @@ int main(int argc, char** argv) } #endif } 
+ + // Open the correct output file or connect to stdout for writing output + if (outfilestr!=0 && strcmp(outfilestr, "-")) + { + outfile = fopen(outfilestr, "wb"); + if (outfile==0) + { + UnicodeString str1(outfilestr,""); + UnicodeString str2(strerror(errno),""); + initMsg(pname); + u_wmsg("cantCreateOutputF", + str1.getBuffer(), + str2.getBuffer()); + return 1; + } + } else { + outfilestr = "-"; + outfile = stdout; #ifdef WIN32 - if( setmode( fileno ( stdout ), O_BINARY ) == -1 ) { - perror ( "Cannot set stdout to binary mode" ); - exit(-1); - } + if( setmode( fileno ( outfile ), O_BINARY ) == -1 ) { + perror ( "Cannot set output file to binary mode" ); + exit(-1); + } #endif + } initMsg(pname); - if (!convertFile(fromcpage, toucallback, touctxt, tocpage, fromucallback, fromuctxt, translit, infile, stdout)) + + if (verbose) { + fprintf(stderr, "%s:\n", infilestr); + } + if (!convertFile(fromcpage, toucallback, touctxt, tocpage, fromucallback, fromuctxt, translit, infile, outfile)) goto error_exit; goto normal_exit; @@ -727,8 +799,10 @@ int main(int argc, char** argv) ret = 1; normal_exit: - if (file!=0) - fclose(file); + if (infile!=stdin) + fclose(infile); + if (outfile != stdout) fclose(outfile); + return ret; }