/****************************************************************************** * * Copyright (C) 1999-2000, International Business Machines * Corporation and others. All Rights Reserved. * *******************************************************************************/ // // uconv demonstration example of ICU and codepage conversion // Purpose is to be a similar tool as the UNIX iconv program. // Shows the usage of the ICU classes: UnicodeConverter, UnicodeString // // Usage: uconv [flag] [file] // -f [codeset] Convert file from this codeset // -t [codeset] Convert file to this code set // -l Display all available converters // -x [transliterator] Run everything through a transliterator // -L Display all available transliterators // If no file is given, uconv tries to read from stdin // // To compile: c++ -o uconv -I${ICUHOME}/include -Wall -g uconv.cpp -L${ICUHOME}/lib -licu-uc -licu-i18n // // Original contributor was Jonas Utterström in 1999 // Permission is granted to use, copy, modify, and distribute this software // #include #include #include #include // This is the UnicodeConverter headerfile #include "unicode/convert.h" // This is the UnicodeString headerfile #include "unicode/unistr.h" // Our message printer.. #include "unicode/uwmsg.h" #ifdef WIN32 #include #include #include #endif #ifdef USE_TRANSLIT # include "unicode/translit.h" #endif static const size_t buffsize = 4096; // Print all available codepage converters static void printAllConverters() { UErrorCode err = U_ZERO_ERROR; int32_t num; size_t numprint = 0; static const size_t maxline = 70; // getAvailable returns a string-table with all available codepages const char* const* convtable = UnicodeConverter::getAvailableNames(num, err); if (U_FAILURE(err)) { u_wmsg("cantGetNames", u_wmsg_errorName(err)); return; } for (int32_t i = 0; imaxline) { putchar('\n'); numprint = 0; } } puts(convtable[num-1]); } // Convert a file from one encoding to another static UBool convertFile(const char* fromcpage, const char* tocpage, FILE* infile, FILE* outfile) { UBool ret = TRUE; UnicodeConverter* convfrom = 0; UnicodeConverter* convto = 0; UErrorCode err = U_ZERO_ERROR; UBool flush; const char* cbuffiter; char* buffiter; const size_t readsize = buffsize-1; char* buff = 0; const UChar* cuniiter; UChar* uniiter; UChar* unibuff = 0; size_t rd, totbuffsize; #if USE_TRANSLIT const char *translit; Transliterator *t = NULL; translit = getenv("TRANSLIT"); if(translit != NULL && *translit) { t = Transliterator::createInstance(UnicodeString(translit, "")); fprintf(stderr, "Opening transliterator: %s\n", translit, t); } #endif // Create codepage converter. If the codepage or its aliases weren't // available, it returns NULL and a failure code convfrom = new UnicodeConverter(fromcpage, err); if (U_FAILURE(err)) { UnicodeString str(fromcpage,""); u_wmsg("cantOpenFromCodeset",str.getBuffer(), u_wmsg_errorName(err)); goto error_exit; } convto = new UnicodeConverter(tocpage, err); if (U_FAILURE(err)) { UnicodeString str(tocpage,""); u_wmsg("cantOpenToCodeset",str.getBuffer(), u_wmsg_errorName(err)); goto error_exit; } // To ensure that the buffer always is of enough size, we // must take the worst case scenario, that is the character in the codepage // that uses the most bytes and multiply it against the buffsize totbuffsize = buffsize*convto->getMaxBytesPerChar(); buff = new char[totbuffsize]; unibuff = new UChar[buffsize]; do { rd = fread(buff, 1, readsize, infile); if (ferror(infile) != 0) { UnicodeString str(strerror(errno), ""); u_wmsg("cantRead",str.getBuffer()); goto error_exit; } // Convert the read buffer into the new coding // After the call 'uniiter' will be placed on the last character that was converted // in the 'unibuff'. // Also the 'cbuffiter' is positioned on the last converted character. // At the last conversion in the file, flush should be set to true so that // we get all characters converted // // The converter must be flushed at the end of conversion so that characters // on hold also will be written uniiter = unibuff; cbuffiter = buff; flush = rd!=readsize; convfrom->toUnicode(uniiter, uniiter+buffsize, cbuffiter, cbuffiter+rd, NULL, flush, err); if (U_FAILURE(err)) { u_wmsg("problemCvtToU", u_wmsg_errorName(err)); goto error_exit; } // At the last conversion, the converted characters should be equal to number // of chars read. if (flush && cbuffiter!=(buff+rd)) { u_wmsg("premEndInput"); goto error_exit; } // Convert the Unicode buffer into the destination codepage // Again 'buffiter' will be placed on the last converted character // And 'cuniiter' will be placed on the last converted unicode character // At the last conversion flush should be set to true to ensure that // all characters left get converted UnicodeString u(unibuff, uniiter-unibuff); buffiter = buff; cuniiter = unibuff; #ifdef USE_TRANSLIT if(t) { t->transliterate(u); u.extract(0, u.length(), unibuff, 0); uniiter = unibuff + u.length(); } #endif convto->fromUnicode(buffiter, buffiter+totbuffsize, cuniiter, cuniiter+(size_t)(uniiter-unibuff), NULL, flush, err); if (U_FAILURE(err)) { u_wmsg("problemCvtFromU", u_wmsg_errorName(err)); goto error_exit; } // At the last conversion, the converted characters should be equal to number // of consumed characters. if (flush && cuniiter!=(unibuff+(size_t)(uniiter-unibuff))) { u_wmsg("premEnd"); goto error_exit; } // Finally, write the converted buffer to the output file rd = (size_t)(buffiter-buff); if (fwrite(buff, 1, rd, outfile) != rd) { UnicodeString str(strerror(errno),""); u_wmsg("cantWrite", str.getBuffer()); goto error_exit; } } while (!flush); // Stop when we have flushed the converters (this means that it's the end of output) goto normal_exit; error_exit: ret = TRUE; normal_exit: if (convfrom) delete convfrom; if (convto) delete convto; #ifdef USE_TRANSLIT if ( t ) delete t; #endif // Close the created converters if (buff) delete [] buff; if (unibuff) delete [] unibuff; return ret; } static UResourceBundle *gBundle = 0; static void initMsg(const char *pname) { static int ps = 0; if (!ps) { char dataPath[500]; UErrorCode err = U_ZERO_ERROR; ps = 1; /* Get messages. */ strcpy(dataPath, u_getDataDirectory()); strcat(dataPath, "uconvmsg"); gBundle = u_wmsg_setPath(dataPath, &err); if(U_FAILURE(err)) { fprintf(stderr, "%s: warning: couldn't open resource bundle %s: %s\n", pname, dataPath, u_errorName(err)); } } } static void usage(const char *pname, int ecode) { const UChar *msg; int32_t msgLen; UErrorCode err = U_ZERO_ERROR; initMsg(pname); msg = ures_getStringByKey(gBundle, ecode ? "lcUsageWord" : "ucUsageWord", &msgLen, &err); UnicodeString upname(pname); UnicodeString mname(msg, msgLen); u_wmsg("usage", mname.getBuffer(), upname.getBuffer()); if (!ecode) { putchar('\n'); u_wmsg("help"); } exit(ecode); } int main(int argc, char** argv) { FILE* file = 0; FILE* infile; int ret = 0; const char* fromcpage = 0; const char* tocpage = 0; const char* infilestr = 0; char** iter = argv+1; char** end = argv+argc; const char *pname = *argv; // First, get the arguments from command-line // to know the codepages to convert between for (; iter!=end; iter++) { // Check for from charset if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) { iter++; if (iter!=end) fromcpage = *iter; } else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter)) { iter++; if (iter!=end) tocpage = *iter; } else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) { printAllConverters(); goto normal_exit; } else if (strcmp("-h", *iter) == 0 || !strcmp("--help", *iter)) { usage(pname, 0); } else if (**iter == '-' && (*iter)[1]) { usage(pname, 1); } else if (!infilestr) { infilestr = *iter; } else { usage(pname, 1); } } if (fromcpage==0 && tocpage==0) { usage(pname, 1); } if (fromcpage==0) { initMsg(pname); u_wmsg("noFromCodeset"); //"No conversion from codeset given (use -f)\n"); goto error_exit; } if (tocpage==0) { initMsg(pname); u_wmsg("noToCodeset"); // "No conversion to codeset given (use -t)\n"); goto error_exit; } // Open the correct input file or connect to stdin for reading input if (infilestr!=0 && strcmp(infilestr, "-")) { file = fopen(infilestr, "rb"); if (file==0) { UnicodeString str1(infilestr,""); UnicodeString str2(strerror(errno),""); initMsg(pname); u_wmsg("cantOpenInputF", str1.getBuffer(), str2.getBuffer()); return 1; } infile = file; } else { infile = stdin; #ifdef WIN32 if( setmode( fileno ( stdin ), O_BINARY ) == -1 ) { perror ( "Cannot set stdin to binary mode" ); exit(-1); } #endif } #ifdef WIN32 if( setmode( fileno ( stdout ), O_BINARY ) == -1 ) { perror ( "Cannot set stdout to binary mode" ); exit(-1); } #endif if (!convertFile(fromcpage, tocpage, infile, stdout)) goto error_exit; goto normal_exit; error_exit: ret = 1; normal_exit: if (file!=0) fclose(file); return ret; }