scuffed-code/icu4c/source/common/unicode/convert.h

/*******************************************************************************
 *
 *   Copyright (C) 1998-1999, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *
 *   Change history:
 *
 *   06/29/2000  helena      Major rewrite of the callback APIs.
 *******************************************************************************/

#ifndef CONVERT_H
#define CONVERT_H


#include "unicode/unistr.h"
#include "unicode/ucnv.h"
/**
 * UnicodeConverter is a C++ wrapper class for UConverter.
 * You need one UnicodeConverter object in place of one UConverter object.
 * For details on the API and implementation of the
 * codepage converter iterface see ucnv.h.
 *
 * @see UConverter
 * @stable
 */

class U_COMMON_API UnicodeConverter
{
 private:
  /*Internal Data representation of the Converter*/
  UConverter* myUnicodeConverter;
  /*Debug method*/
  void printRef(void) const;

  /* list of converter and alias names */
  static const char **availableConverterNames;
  static int32_t availableConverterNamesCount;

 public:

//Constructors and a destructor

  /**
 * Creates Unicode Conversion Object will default to LATIN1 <-> encoding
 * @return An object Handle if successful or a NULL if the creation failed
 * @stable
 */
 UnicodeConverter();

/**
 * Creates Unicode Conversion Object by specifying the codepage name.  The name
 * string is in ASCII format.
 * @param code_set the pointer to a char[] object containing a codepage name. (I)
 * @param UErrorCode Error status (I/O) IILLEGAL_ARGUMENT_ERROR will be returned if the string is empty.
 * If the internal program does not work correctly, for example, if there's no such codepage,
 * U_INTERNAL_PROGRAM_ERROR will be returned.
 * @return An object Handle if successful or a NULL if the creation failed
 * @stable
 */
 UnicodeConverter(const char*             name,
             UErrorCode&              err);

 /**
  *Creates a UnicodeConverter object with the names specified as unicode strings. The name should be limited to
  *the ASCII-7 alphanumerics. Dash and underscore characters are allowed for readability, but are ignored in the
  *search.
  *@param code_set name of the uconv table in Unicode string (I)
  *@param err error status (I/O) IILLEGAL_ARGUMENT_ERROR will be returned if the string is empty.  If the internal
  *program does not work correctly, for example, if there's no such codepage, U_INTERNAL_PROGRAM_ERROR will be
  *returned.
  *@return the created Unicode converter object
  * @stable
  */
 UnicodeConverter(const UnicodeString&    name,
             UErrorCode&              err);

 /**
  * Creates Unicode Conversion Object using the codepage ID number.
  * @param code_set a codepage # (I)
  * @UErrorCode Error status (I/O) IILLEGAL_ARGUMENT_ERROR will be returned if the string is empty.
  * If the internal program does not work correctly, for example, if there's no such codepage,
  * U_INTERNAL_PROGRAM_ERROR will be returned.
  * @return An object Handle if successful or a NULL if failed
  * @stable
  */
 UnicodeConverter(int32_t                      codepageNumber,
             UConverterPlatform  platform,
             UErrorCode&                   err);

 ~UnicodeConverter();


 /**
  * Transcodes the source UnicodeString to the target string in a codepage encoding
  * with the specified Unicode converter.  For example, if a Unicode to/from JIS
  * converter is specified, the source string in Unicode will be transcoded to JIS
  * encoding.  The result will be stored in JIS encoding.
  *
  * @param source the source Unicode string
  * @param target the target string in codepage encoding
  * @param targetSize Input the number of bytes available in the "target" buffer, Output the number of bytes copied to it
  * @param err the error status code.  U_MEMORY_ALLOCATION_ERROR will be returned if the
  * the internal process buffer cannot be allocated for transcoding.  U_ILLEGAL_ARGUMENT_ERROR
  * is returned if the converter is null or the source or target string is empty.
  * @draft backslash versus Yen sign in shift-JIS
  */
void fromUnicodeString(char*                    target,
               int32_t&                 targetSize,
               const UnicodeString&     source,
               UErrorCode&               err) const;

/**
 * Transcode the source string in codepage encoding to the target string in
 * Unicode encoding.  For example, if a Unicode to/from JIS
 * converter is specified, the source string in JIS encoding will be transcoded
 * to Unicode encoding.  The result will be stored in Unicode encoding.
 * @param source the source string in codepage encoding
 * @param target the target string in Unicode encoding
 * @param targetSize : I/O parameter, Input size buffer, Output # of bytes copied to it
 * @param err the error status code U_MEMORY_ALLOCATION_ERROR will be returned if the
 * the internal process buffer cannot be allocated for transcoding.  U_ILLEGAL_ARGUMENT_ERROR
 * is returned if the converter is null or the source or target string is empty.
 * @stable
 */
void  toUnicodeString(UnicodeString&    target,
                      const char*       source,
                      int32_t           sourceSize,
                      UErrorCode&        err) const;

/**
 * Transcodes an array of unicode characters to an array of codepage characters.
 * The source pointer is an I/O parameter, it starts out pointing at the place
 * to begin translating, and ends up pointing after the first sequence of the bytes
 * that it encounters that are semantically invalid.
 * if T_UnicodeConverter_setMissingCharAction is called with an action other than STOP
 * before a call is made to this API, consumed and source should point to the same place
 * (unless target ends with an imcomplete sequence of bytes and flush is FALSE).
 * @param target : I/O parameter. Input : Points to the beginning of the buffer to copy
 *  codepage characters to. Output : points to after the last codepage character copied
 *  to target.
 * @param targetLimit the pointer to the end of the target array
 * @param source the source Unicode character array
 * @param sourceLimit the pointer to the end of the source array
 * @param flush TRUE if the buffer is the last buffer and the conversion will finish
 * in this call, FALSE otherwise.  (future feature pending)
 * @param UErrorCode the error status.  U_ILLEGAL_ARGUMENT_ERROR will be returned if the
 * converter is null.
 * @draft backslash versus Yen sign in shift-JIS
 */
void fromUnicode(char*&         target,
                 const char*    targetLimit,
                 const UChar*&      source,
                 const UChar* sourceLimit,
                 int32_t * offsets,
                 UBool         flush,
                 UErrorCode&     err);


/**
 * Converts an array of codepage characters into an array of unicode characters.
 * The source pointer is an I/O parameter, it starts out pointing at the place
 * to begin translating, and ends up pointing after the first sequence of the bytes
 * that it encounters that are semantically invalid.
 * if T_UnicodeConverter_setMissingUnicodeAction is called with an action other than STOP
 * before a call is made to this API, consumed and source should point to the same place
 * (unless target ends with an imcomplete sequence of bytes and flush is FALSE).
 * @param target : I/O parameter. Input : Points to the beginning of the buffer to copy
 *  Unicode characters to. Output : points to after the last UChar copied to target.
 * @param targetLimit the pointer to the end of the target array
 * @param source the source codepage character array
 * @param sourceLimit the pointer to the end of the source array
 * @param flush TRUE if the buffer is the last buffer and the conversion will finish
 * in this call, FALSE otherwise.  (future feature pending)
 * @param err the error code status  U_ILLEGAL_ARGUMENT_ERROR will be returned if the
 * converter is null, targetLimit < target, sourceLimit < source
 * @stable
 */
 void toUnicode(UChar*&        target,
        const UChar*   targetLimit,
        const char*&     source,
        const char*      sourceLimit,
        int32_t * offsets,
        UBool           flush,
        UErrorCode&       err);


/**
 * Returns the maximum length of bytes used by a character. This varies between 1 and 4
 * @return the max number of bytes per codepage character  * converter is null, targetLimit < target, sourceLimit < source
 * @stable
 */
int8_t getMaxBytesPerChar(void) const;

/**
* Returns the minimum byte length for characters in this codepage. This is either
* 1 or 2 for all supported codepages.
* @return the minimum number of byte per codepage character
* @stable
*/
int8_t getMinBytesPerChar(void) const;

/**
 *Gets the type of conversion associated with the converter
 * e.g. SBCS, MBCS, DBCS, UTF8, UTF16_BE, UTF16_LE, ISO_2022, EBCDIC_STATEFUL, LATIN_1
 * @return the type of the converter
 * @stable
 */
UConverterType getType(void) const;

/**
 *Gets the "starter" bytes for the converters of type MBCS
 *will fill in an <TT>U_ILLEGAL_ARGUMENT_ERROR</TT> if converter passed in
 *is not MBCS.
 *fills in an array of boolean, with the value of the byte as offset to the array.
 *At return, if TRUE is found in at offset 0x20, it means that the byte 0x20 is a starter byte
 *in this converter.
 * @param starters: an array of size 256 to be filled in
 * @param err: an array of size 256 to be filled in
 * @see ucnv_getType
 * @stable
 */
 void getStarters(UBool starters[256],
                  UErrorCode& err) const;
 /**
 * Fills in the output parameter, subChars, with the substitution characters
 * as multiple bytes.
 * @param subChars the subsitution characters
 * @param len the number of bytes of the substitution character array
 * @param  err the error status code.  U_ILLEGAL_ARGUMENT_ERROR will be returned if
 * the converter is null.  If the substitution character array is too small, an
 * U_INDEX_OUTOFBOUNDS_ERROR will be returned.
 * @stable
 */
void getSubstitutionChars(char*         subChars,
                          int8_t&       len,
                          UErrorCode&    err) const;
/**
 * Sets the substitution chars when converting from unicode to a codepage. The
 * substitution is specified as a string of 1-4 bytes, and may contain null byte.
 * The fill-in parameter err will get the error status on return.
 * @param cstr the substitution character array to be set with
 * @param len the number of bytes of the substitution character array and upon return will contain the
 * number of bytes copied to that buffer
 * @param err the error status code.  U_ILLEGAL_ARGUMENT_ERROR if the converter is
 * null.   or if the number of bytes provided are not in the codepage's range (e.g length 1 for ucs-2)
 * @stable
 */
void setSubstitutionChars(const char*   subChars,
                          int8_t        len,
                          UErrorCode&    err);

/**
 * Resets the state of stateful conversion to the default state. This is used
 * in the case of error to restart a conversion from a known default state.
 * @stable
 */
void resetState(void);

/**
 * Gets the name of the converter (zero-terminated).
 * the name will be the internal name of the converter
 * @param converter the Unicode converter
 * @param err the error status code. U_INDEX_OUTOFBOUNDS_ERROR in the converterNameLen is too
 * small to contain the name.
 * @stable
 */
const char*  getName( UErrorCode&  err) const;


/**
 * Gets a codepage number associated with the converter. This is not guaranteed
 * to be the one used to create the converter. Some converters do not represent
 * IBM registered codepages and return zero for the codepage number.
 * The error code fill-in parameter indicates if the codepage number is available.
 * @param err the error status code.  U_ILLEGAL_ARGUMENT_ERROR will returned if
 * the converter is null or if converter's data table is null.
 * @return If any error occurrs, null will be returned.
 * @stable
 */
 int32_t  getCodepage(UErrorCode& err) const;

 /**
  * Returns the current setting action taken when a character from a codepage
  * is missing or a byte sequence is illegal etc.
  * @param action the callback function pointer
  * @param context the callback function state
  * @stable
  */
 void getMissingCharAction(UConverterToUCallback *action,
                           void **context) const;

/**
 * Return the current setting action taken when a unicode character is missing
 * or there is an unpaired surrogate etc.
 * @param action the callback function pointer
 * @param context the callback function state
 * @stable
 */
 void getMissingUnicodeAction(UConverterFromUCallback *action,
                              void **context) const;

 /**
  * Sets the current setting action taken when a character from a codepage is
  * missing. (Currently STOP or SUBSTITUTE).
  * @param newAction the action constant if an equivalent codepage character is missing
  * @param newContext the new toUnicode callback function state
  * @param oldAction the original action constant, saved for later restoration.
  * @param oldContext the old toUnicode callback function state
  * @param err the error status code
  * @stable
  */
 void  setMissingCharAction(UConverterToUCallback     newAction,
                void* newContext,
                UConverterToUCallback *oldAction, 
                void** oldContext,
                UErrorCode&            err);

/**
 * Sets the current setting action taken when a unicode character is missing.
 * (currently T_UnicodeConverter_MissingUnicodeAction is either STOP or SUBSTITUTE,
 *  SKIP, CLOSEST_MATCH, ESCAPE_SEQ may be added in the future).
 * @param newAction the action constant if an equivalent Unicode character is missing
 * @param newContext the new fromUnicode callback function state
 * @param oldAction the original action constant, saved for later restoration.
 * @param oldContext the old fromUnicode callback function state
 * @param err the error status code
 * @stable
 */
 void  setMissingUnicodeAction(UConverterFromUCallback  newAction,
                   void* newContext,
                   UConverterFromUCallback *oldAction,
                   void** oldContext,
                   UErrorCode&            err);
/**
 * Returns the localized name of the UnicodeConverter, if for any reason it is
 * available, the internal name will be returned instead.
 * @param displayLocale the valid Locale, from which we want to localize
 * @param displayString a UnicodeString that is going to be filled in.
 * @stable
 */
void getDisplayName(const Locale&   displayLocale,
                    UnicodeString&  displayName) const;

/**
 * Returns the T_UnicodeConverter_platform (ICU defined enum) of a UnicodeConverter
 * available, the internal name will be returned instead.
 * @param err the error code status
 * @return the codepages platform
 * @stable
 */
UConverterPlatform  getCodepagePlatform(UErrorCode& err) const;


 UnicodeConverter&   operator=(const UnicodeConverter& that);
 UBool              operator==(const UnicodeConverter& that) const;
 UBool              operator!=(const UnicodeConverter& that) const;
 UnicodeConverter(const UnicodeConverter&  that);

/**
 * Returns the available names. Lazy evaluated, Library owns the storage
 * @param num the number of available converters
 * @param err the error code status
 * @return the name array
 * @stable
 */
static  const char* const* getAvailableNames(int32_t&   num,
                         UErrorCode&  err);

/**
 * Iterates through every cached converter and frees all the unused ones
 * @return the number of cached converters successfully deleted
 * @stable
 */
static   int32_t flushCache(void);
/**
 * Fixes the backslash character mismapping.  For example, in SJIS, the backslash 
 * character in the ASCII portion is also used to represent the yen currency sign.  
 * When mapping from Unicode character 0x005C, it's unclear whether to map the 
 * character back to yen or backslash in SJIS.  This function will take the input
 * buffer and replace all the yen sign characters with backslash.  This is necessary
 * when the user tries to open a file with the input buffer on Windows.
 * @param source the input buffer to be fixed
 * @draft
 */
void fixFileSeparator(UnicodeString& source) const;

/**
 * Determines if the converter contains ambiguous mappings of the same
 * character or not.
 * @return TRUE if the converter contains ambiguous mapping of the same 
 * character, FALSE otherwise.
 * @draft
 */
UBool isAmbiguous(void) const;

};

typedef UnicodeConverter UnicodeConverterCPP;   /* Backwards compatibility. */

#endif