From 62bfc593304bb08fe512c4c86bc343b372a7e863 Mon Sep 17 00:00:00 2001 From: Helena Chapman Date: Sat, 8 Jan 2000 00:51:44 +0000 Subject: [PATCH] ICU-216 Added fixFileSeparator and isAmbiguous in UnicodeConverterCPP class and ucnv_xxx interface. X-SVN-Rev: 497 --- icu4c/source/common/convert.cpp | 41 +++++++++++++++++++ icu4c/source/common/ucnv.c | 55 +++++++++++++++++++++++++- icu4c/source/common/unicode/convert.h | 19 +++++++++ icu4c/source/common/unicode/ucnv.h | 20 ++++++++++ icu4c/source/common/unicode/ucnv_bld.h | 17 ++++++++ 5 files changed, 150 insertions(+), 2 deletions(-) diff --git a/icu4c/source/common/convert.cpp b/icu4c/source/common/convert.cpp index d56af9acc9..d24106ace6 100644 --- a/icu4c/source/common/convert.cpp +++ b/icu4c/source/common/convert.cpp @@ -433,3 +433,44 @@ int32_t UnicodeConverterCPP::flushCache() { return ucnv_flushCache(); } + +/* HSYS: To be cleaned up. The usage of UChar* and UnicodeString in +the C++ APIs needs to be revisited. Replaces, in place, every code unit of source that equals the 'mismapped' value listed in UCNV_AMBIGUOUSCONVERTERS for this converter's codepage with that entry's 'replacement'. Does nothing if source is empty, if getCodepage reports a failure, or if the codepage is not listed. */ +void UnicodeConverterCPP::fixFileSeparator(UnicodeString& source) const +{ + int32_t i = 0; + int32_t index = -1; /* -1 = codepage not found in UCNV_AMBIGUOUSCONVERTERS; must not start at 0, or entry 0 would be applied to unlisted codepages */ + int32_t ccsid = 0; + UErrorCode status = U_ZERO_ERROR; + if (source.length() == 0) + { + return; + } + ccsid = getCodepage(status); + if (U_FAILURE(status)) + { + return; + } + for (i = 0; i < UCNV_MAX_AMBIGUOUSCCSIDS; i++) { + if (ccsid == UCNV_AMBIGUOUSCONVERTERS[i].ccsid) + { + index = i; + break; + } + } + if (index != -1) + { + for (i = 0; i < source.length(); i++) + { + if (source[i] == UCNV_AMBIGUOUSCONVERTERS[index].mismapped) + { + source[i] = UCNV_AMBIGUOUSCONVERTERS[index].replacement; + } + } + } +} +/* Forwards to ucnv_isAmbiguous with myUnicodeConverter and returns its result. */ +bool_t UnicodeConverterCPP::isAmbiguous(void) const +{ + return ucnv_isAmbiguous(myUnicodeConverter); +} \ No newline at end of file diff --git a/icu4c/source/common/ucnv.c b/icu4c/source/common/ucnv.c index 92dfbb26d6..46fb090128 100644 --- a/icu4c/source/common/ucnv.c +++ b/icu4c/source/common/ucnv.c @@ -38,6 +38,9 @@ #define CHUNK_SIZE 5*1024 +/* Internal function : begin */ 
+static int32_t ucnv_getAmbiguousCCSID (const UConverter* cnv); +/* Internal function : end */ typedef void (*T_ToUnicodeFunction) (UConverter *, UChar **, @@ -130,7 +133,6 @@ static T_GetNextUCharFunction GET_NEXT_UChar_FUNCTIONS[UCNV_NUMBER_OF_SUPPORTED_ T_UConverter_getNextUChar_ISO_2022 }; - void flushInternalUnicodeBuffer (UConverter * _this, UChar * myTarget, int32_t * myTargetIndex, @@ -512,7 +514,6 @@ UConverterFromUCallback ucnv_setFromUCallBack (UConverter * converter, return myReturn; } -#include void ucnv_fromUnicode (UConverter * _this, char **target, const char *targetLimit, @@ -1154,3 +1155,53 @@ void ucnv_getStarters(const UConverter* converter, uprv_memcpy(starters, converter->sharedData->table->mbcs.starters, 256*sizeof(bool_t)); return; } +/* Returns the index of the UCNV_AMBIGUOUSCONVERTERS entry whose ccsid equals cnv's CCSID, or -1 if cnv is NULL, ucnv_getCCSID reports a failure, or no entry matches. */ +static int32_t ucnv_getAmbiguousCCSID(const UConverter *cnv) +{ + UErrorCode status = U_ZERO_ERROR; + int32_t i = 0; + int32_t ccsid = 0; + if (cnv == NULL) + { + return -1; + } + ccsid = ucnv_getCCSID(cnv, &status); + if (U_FAILURE(status)) + { + return -1; + } + for (i = 0; i < UCNV_MAX_AMBIGUOUSCCSIDS; i++) { + if (ccsid == UCNV_AMBIGUOUSCONVERTERS[i].ccsid) + { + return i; + } + } + return -1; +} +/* Replaces, in place, each of the first sourceLength UChars of source that equals the 'mismapped' value for cnv's CCSID with the matching 'replacement'. Does nothing if source or cnv is NULL, or if cnv's CCSID is not in UCNV_AMBIGUOUSCONVERTERS. */ +void ucnv_fixFileSeparator(const UConverter *cnv, + UChar* source, + int32_t sourceLength) +{ + int32_t i = 0; + int32_t index = 0; + if ((source == NULL) || (cnv == NULL)) + { + return; + } + if ((index = ucnv_getAmbiguousCCSID(cnv)) != -1) + { + for (i = 0; i < sourceLength; i++) + { + if (source[i] == UCNV_AMBIGUOUSCONVERTERS[index].mismapped) + { + source[i] = UCNV_AMBIGUOUSCONVERTERS[index].replacement; + } + } + } +} +/* Returns TRUE if cnv's CCSID is listed in UCNV_AMBIGUOUSCONVERTERS, FALSE otherwise (including when cnv is NULL). */ +bool_t ucnv_isAmbiguous(const UConverter *cnv) +{ + return (ucnv_getAmbiguousCCSID(cnv) == -1 ? 
FALSE : TRUE); +} diff --git a/icu4c/source/common/unicode/convert.h b/icu4c/source/common/unicode/convert.h index b1a7704648..2e09c70b7e 100644 --- a/icu4c/source/common/unicode/convert.h +++ b/icu4c/source/common/unicode/convert.h @@ -321,5 +321,24 @@ static const char* const* getAvailableNames(int32_t& num, * @return the number of cached converters successfully deleted */ static int32_t flushCache(void); +/** + * Fixes the backslash character mismapping. For example, in SJIS, the backslash + * character in the ASCII portion is also used to represent the yen currency sign. + * When mapping from Unicode character 0x005C, it's unclear whether to map the + * character back to yen or backslash in SJIS. This function will take the input + * buffer and replace all the yen sign (or won sign) characters with backslash. This is necessary + * when the user tries to open a file with the input buffer on Windows. + * @param source the input buffer to be fixed + */ +void fixFileSeparator(UnicodeString& source) const; + +/** + * Determines if the converter contains ambiguous mappings of the same + * character or not. + * @return TRUE if the converter contains ambiguous mapping of the same + * character, FALSE otherwise. + */ +bool_t isAmbiguous(void) const; + }; #endif diff --git a/icu4c/source/common/unicode/ucnv.h b/icu4c/source/common/unicode/ucnv.h index 15ffbd4bca..ee3b9cbc63 100644 --- a/icu4c/source/common/unicode/ucnv.h +++ b/icu4c/source/common/unicode/ucnv.h @@ -600,6 +600,26 @@ U_CAPI const char * U_EXPORT2 ucnv_getDefaultName (void); */ U_CAPI void U_EXPORT2 ucnv_setDefaultName (const char *name); +/** + * Fixes the backslash character mismapping. For example, in SJIS, the backslash + * character in the ASCII portion is also used to represent the yen currency sign, + * so when mapping from Unicode character 0x005C it's unclear whether to map the + * character back to yen or backslash. 
This function replaces each mismapped character (the yen or won sign) + * in the input buffer with backslash, which is necessary to open a file with the buffer on Windows. + * @param cnv the converter whose CCSID selects the character to replace; the buffer is left unchanged if cnv is NULL or not ambiguous + * @param source the input buffer to be fixed + * @param sourceLength the length of the input buffer, in UChars + */ +U_CAPI void U_EXPORT2 ucnv_fixFileSeparator(const UConverter *cnv, UChar* source, int32_t sourceLength); + +/** + * Determines if the converter contains ambiguous mappings of the same character or not. + * @param cnv the converter to check; NULL yields FALSE + * @return TRUE if the converter contains ambiguous mapping of the same + * character, FALSE otherwise. + */ +U_CAPI bool_t U_EXPORT2 ucnv_isAmbiguous(const UConverter *cnv); + #endif /*_UCNV*/ diff --git a/icu4c/source/common/unicode/ucnv_bld.h b/icu4c/source/common/unicode/ucnv_bld.h index 0fbacc1776..36ab329a9c 100644 --- a/icu4c/source/common/unicode/ucnv_bld.h +++ b/icu4c/source/common/unicode/ucnv_bld.h @@ -24,6 +24,7 @@ #define UCNV_MAX_SUBCHAR_LEN 4 #define UCNV_ERROR_BUFFER_LENGTH 20 +#define UCNV_MAX_AMBIGUOUSCCSIDS 5 #ifndef UCMP16_H typedef struct _CompactShortArray CompactShortArray; @@ -66,6 +67,22 @@ typedef enum { UCNV_GB = 11 } UConverterType; +typedef struct +{ + int32_t ccsid; /* CCSID this entry applies to */ + UChar mismapped; /* code unit searched for by the fixFileSeparator functions */ + UChar replacement; /* code unit written in its place */ +} UAmbiguousConverter; +/* One { ccsid, mismapped, replacement } entry per ambiguous CCSID. NOTE(review): defined static in a header, so every file that includes it gets its own copy. */ +static const UAmbiguousConverter UCNV_AMBIGUOUSCONVERTERS[UCNV_MAX_AMBIGUOUSCCSIDS] = +{ + { 943, 0x00A5, 0x005C }, + { 949, 0x20A9, 0x005C }, + { 1361, 0x20A9, 0x005C }, + { 942, 0x00A5, 0x005C }, + { 1363, 0x20A9, 0x005C } +}; + typedef enum { UCNV_UNKNOWN = -1, UCNV_IBM = 0