ICU-1220 Added escape-codepoint as a callback to let people use the new
UCNV_ESCAPE_CODEPOINT.
Options compatibility with iconv(1): added -V, --version; -s, --silent;
--verbose (and -v because of ICU commands); -o, --output file. uconv(1) is
now mostly compatible with iconv(1) usage, except for the fact that one
cannot convert many files in one run yet.

X-SVN-Rev: 7414
This commit is contained in:
Yves Arrouye 2002-01-09 01:04:32 +00:00
parent 85a889c4c2
commit 3b470abbe3
3 changed files with 152 additions and 26 deletions

View File

@ -1,6 +1,6 @@
// -*- Coding: utf-8; -*- [all uconv resource files]
// Copyright (c) 2000 IBM, Inc. and Others.
// $Revision: 1.16 $
// $Revision: 1.17 $
//
// Root translation file for uconv messages.
// So you want to translate this file??? Great!
@ -30,9 +30,12 @@ root
lcUsageWord { "usage" }
ucUsageWord { "Usage" }
usage { "{0}: {1} [ -h, -?, --help ] [ -l, --list | --list-code code | --default-code | -L, --list-transliterators ] [ --canon ] [ -x transliterator ] [ --to-callback callback | -c ] [ --from-callback callback | -i ] -f, --from-code code -t, --to-code code [ file ]\n" }
usage { "{0}: {1} [ -h, -?, --help ] [ -V, --version ] [ -s, --silent ] [ -v, --verbose ] [ -l, --list | --list-code code | --default-code | -L, --list-transliterators ] [ --canon ] [ -x transliterator ] [ --to-callback callback | -c ] [ --from-callback callback | -i ] [ --callback callback ] -f, --from-code code -t, --to-code code [ file ] [ -o, --output file ]\n" }
help { "Options: -h, --help print this message\n"
" -V, --version print the program version\n"
" -s, --silent suppress messages\n"
" -v, --verbose display progress information\n"
" -l, --list list all available encodings\n"
" --list-code code list only the given encoding\n"
" --default-code list only the default encoding\n"
@ -43,8 +46,10 @@ root
" -c omit invalid characters from the output\n"
" --from-callback callback use callback on original encoding\n"
" -i ignore invalid sequences in the input\n"
" --callback callback use callback on both encodings\n"
" -f, --from-code code set the original encoding\n"
" -t, --to-code code set the destination encoding\n"
" -o, --output file write output to file\n"
"\n"
"Callbacks:"
}
@ -66,6 +71,7 @@ root
unknownCallback { "Unknown callback: {0}\n" } // 0: callback name
cantOpenInputF { "Couldn''t open input file {0}: {1}.\n" } // 0: file, 1: strerror [OS error string]
cantCreateOutputF { "Couldn''t create output file {0}: {1}.\n" } // 0: file, 1: strerror [OS error string]
cantWrite { "The converted text couldn''t be written: {0}.\n" } // 0: OS error string
cantRead { "Error reading from input file {0}.\n" } // 0: OS error string

View File

@ -16,6 +16,15 @@
.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
]
[
.BI "\-V\fP, \fB\-\-version"
]
[
.BI "\-s\fP, \fB\-\-silent"
]
[
.BI "\-v\fP, \fB\-\-verbose"
]
[
.BI "\-l\fP, \fB\-\-list"
|
.BI "\-\-list\-code" " code"
@ -40,14 +49,24 @@
|
.B "\-i"
]
[
.BI "\-\-callback" " callback"
]
.BI "\-f\fP, \fB\-\-from\-code" " encoding"
.BI "\-t\fP, \fB\-\-to\-code" " encoding"
[
.I file
.IR file
]
[
.BI "\-o\fP, \fB\-\-output" " file"
]
.SH DESCRIPTION
.B uconv
converts its input from one given
converts each given
.I file
(or its standard input if no
.I file
is specified) from one
.I encoding
to another. The transcoding is done using Unicode as a pivot encoding
(e.g. the data are first transcoded from their original encoding to
@ -66,6 +85,12 @@ after the data have been transcoded to Unicode.
.BR \-h\fP, \fB\-?\fP, \fB\-\-help
Print help about usage and exit.
.TP
.BI "\-s\fP, \fB\-\-silent"
Suppress messages during execution.
.TP
.BI "\-v\fP, \fB\-\-verbose"
Display extra informative messages during execution.
.TP
.BI "\-l\fP, \fB\-\-list"
List all the available encodings and exit.
.TP
@ -125,6 +150,15 @@ Ignore invalid sequences in the input.
Same as
.BR "\-\-from\-callback skip" .
.TP
.BI "\-\-callback" " callback"
Use
.I callback
to handle both characters that cannot be transcoded from the original
encoding and characters that cannot be transcoded to the destination
encoding. See section
.B CALLBACKS
for details on valid callbacks.
.TP
.BI "\-f\fP, \fB\-\-from\-code" " encoding"
Set the original encoding of the data to
.IR encoding .
@ -132,6 +166,10 @@ Set the original encoding of the data to
.BI "\-t\fP, \fB\-\-to\-code" " encoding"
Transcode the data to
.IR encoding .
.TP
.BI "\-o\fP, \fB\-\-output" " file"
Write the transcoded data to
.IR file .
.SH CALLBACKS
.B uconv
supports specifying callbacks to handle invalid data. Callbacks can be
@ -150,19 +188,19 @@ callbacks actually supported by
is displayed when it is called with
.BR "\-h\fP, \fB\-\-help" .
.PP
.TP \w'\fBescape-xml-hex'u+3n
.TP \w'\fBescape-codepoint'u+3n
.B substitute
Write the encoding's substitute sequence, or the Unicode
replacement character
.B U+FFFD
when transcoding to Unicode.
This is the default callback.
.TP
.B skip
Ignore the invalid data.
.TP
.B stop
Stop with an error when encountering invalid data.
This is the default callback.
.TP
.B escape
Same as
@ -210,6 +248,14 @@ Replace the missing characters with a string of the format
where
.I hhhh
is the hexadecimal value of the character.
.TP
.B escape-codepoint
Replace the missing characters with a string of the format
.BR U+\fIhhhh\fP ,
where
.I hhhh
is the hexadecimal value of the character. This is the format
universally used to denote a Unicode codepoint in the literature.
.SH VERSION
@VERSION@
.SH COPYRIGHT

View File

@ -94,7 +94,8 @@ static struct callback_ent {
{ "escape-c", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C },
{ "escape-xml", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC },
{ "escape-xml-dec", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC },
{ "escape-xml-hex", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX }
{ "escape-xml-hex", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
{ "escape-codepoint", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_CODEPOINT, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_CODEPOINT }
};
static const struct callback_ent *findCallback(const char *name) {
@ -324,6 +325,8 @@ static UBool convertFile(const char* fromcpage,
const size_t readsize = buffsize-1;
char* buff = 0;
uint32_t foffset = 0; /* Where we are in the file, for error reporting. */
UConverterFromUCallback oldfromucallback;
UConverterToUCallback oldtoucallback;
const void *oldcontext;
@ -351,7 +354,9 @@ static UBool convertFile(const char* fromcpage,
}
// Create codepage converter. If the codepage or its aliases weren't
// available, it returns NULL and a failure code
// available, it returns NULL and a failure code. We also set the
// callbacks, and return errors in the same way.
convfrom = ucnv_open(fromcpage, &err);
if (U_FAILURE(err))
{
@ -385,10 +390,13 @@ static UBool convertFile(const char* fromcpage,
// To ensure that the buffer always is of enough size, we
// must take the worst case scenario, that is the character in the codepage
// that uses the most bytes and multiply it against the buffsize
totbuffsize = buffsize * ucnv_getMaxCharSize(convto);
buff = new char[totbuffsize];
unibuff = new UChar[buffsize];
// OK, we can convert now.
do
{
rd = fread(buff, 1, readsize, infile);
@ -413,6 +421,8 @@ static UBool convertFile(const char* fromcpage,
flush = rd!=readsize;
ucnv_toUnicode(convfrom, &uniiter, uniiter + buffsize, &cbuffiter, cbuffiter + rd, 0, flush, &err);
foffset += uniiter - unibuff;
if (U_FAILURE(err))
{
u_wmsg("problemCvtToU", u_wmsg_errorName(err));
@ -517,29 +527,35 @@ static void usage(const char *pname, int ecode)
int main(int argc, char** argv)
{
FILE* file = 0;
FILE* infile;
FILE* infile, *outfile;
int ret = 0;
const char* fromcpage = 0;
const char* tocpage = 0;
const char *translit = 0;
const char* infilestr = 0;
const char* outfilestr = 0;
UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_SUBSTITUTE;
UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP;
const void *fromuctxt = 0;
UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_SUBSTITUTE;
UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP;
const void *touctxt = 0;
char** iter = argv+1;
char** end = argv+argc;
const char *pname = *argv;
const char *pname;
int printConvs = 0, printCanon = 0;
const char *printName = 0;
int printTranslits = 0;
int silent = 0, verbose = 0;
// Prettify pname.
for (pname = *argv + strlen(*argv) - 1; pname != *argv && *pname != U_FILE_SEP_CHAR; --pname);
if (*pname == U_FILE_SEP_CHAR) ++pname;
// First, get the arguments from command-line
// to know the codepages to convert between
for (; iter!=end; iter++)
@ -651,7 +667,40 @@ int main(int argc, char** argv)
else if (!strcmp("-i", *iter)) {
toucallback = UCNV_TO_U_CALLBACK_SKIP;
}
else if (**iter == '-' && (*iter)[1]) {
else if (!strcmp("--callback", *iter)) {
iter++;
if (iter!=end) {
const struct callback_ent *cbe = findCallback(*iter);
if (cbe) {
fromucallback = cbe->fromu;
fromuctxt = cbe->fromuctxt;
toucallback = cbe->tou;
touctxt = cbe->touctxt;
} else {
UnicodeString str(*iter);
initMsg(pname);
u_wmsg("unknownCallback", str.getBuffer());
return 4;
}
} else {
usage(pname, 1);
}
}
else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) {
silent = 1;
} else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) {
verbose = 1;
} else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) {
printf("%s v2.0\n", pname);
return 0;
} else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) {
++iter;
if (iter != end && !outfilestr) {
outfilestr = *iter;
} else {
usage(pname, 1);
}
} else if (**iter == '-' && (*iter)[1]) {
usage(pname, 1);
} else if (!infilestr) {
infilestr = *iter;
@ -689,8 +738,8 @@ int main(int argc, char** argv)
// Open the correct input file or connect to stdin for reading input
if (infilestr!=0 && strcmp(infilestr, "-"))
{
file = fopen(infilestr, "rb");
if (file==0)
infile = fopen(infilestr, "rb");
if (infile==0)
{
UnicodeString str1(infilestr,"");
UnicodeString str2(strerror(errno),"");
@ -700,9 +749,9 @@ int main(int argc, char** argv)
str2.getBuffer());
return 1;
}
infile = file;
}
else {
infilestr = "-";
infile = stdin;
#ifdef WIN32
if( setmode( fileno ( stdin ), O_BINARY ) == -1 ) {
@ -711,15 +760,38 @@ int main(int argc, char** argv)
}
#endif
}
// Open the correct output file or connect to stdout for writing output
if (outfilestr!=0 && strcmp(outfilestr, "-"))
{
outfile = fopen(outfilestr, "wb");
if (outfile==0)
{
UnicodeString str1(outfilestr,"");
UnicodeString str2(strerror(errno),"");
initMsg(pname);
u_wmsg("cantCreateOutputF",
str1.getBuffer(),
str2.getBuffer());
return 1;
}
} else {
outfilestr = "-";
outfile = stdout;
#ifdef WIN32
if( setmode( fileno ( stdout ), O_BINARY ) == -1 ) {
perror ( "Cannot set stdout to binary mode" );
if( setmode( fileno ( outfile ), O_BINARY ) == -1 ) {
perror ( "Cannot set output file to binary mode" );
exit(-1);
}
#endif
}
initMsg(pname);
if (!convertFile(fromcpage, toucallback, touctxt, tocpage, fromucallback, fromuctxt, translit, infile, stdout))
if (verbose) {
fprintf(stderr, "%s:\n", infilestr);
}
if (!convertFile(fromcpage, toucallback, touctxt, tocpage, fromucallback, fromuctxt, translit, infile, outfile))
goto error_exit;
goto normal_exit;
@ -727,8 +799,10 @@ int main(int argc, char** argv)
ret = 1;
normal_exit:
if (file!=0)
fclose(file);
if (infile!=stdin)
fclose(infile);
if (outfile != stdout) fclose(outfile);
return ret;
}