diff --git a/.gitattributes b/.gitattributes
index af9c124045..9931d13ee2 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -60,6 +60,7 @@ icu4c/source/data/in/ubidi.icu -text
icu4c/source/data/in/ucase.icu -text
icu4c/source/data/in/unames.icu -text
icu4c/source/data/in/uprops.icu -text
+icu4c/source/data/in/uts46.nrm -text
icu4c/source/data/lang/pool.res -text
icu4c/source/data/locales/pool.res -text
icu4c/source/data/region/pool.res -text
diff --git a/icu4c/source/common/Makefile.in b/icu4c/source/common/Makefile.in
index ba712ee04a..dbe30ed5ae 100644
--- a/icu4c/source/common/Makefile.in
+++ b/icu4c/source/common/Makefile.in
@@ -91,7 +91,7 @@ utrie.o utrie2.o utrie2_builder.o bmpset.o unisetspan.o uset_props.o uniset_prop
uarrsort.o brkiter.o ubrk.o brkeng.o dictbe.o triedict.o \
rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o \
serv.o servnotf.o servls.o servlk.o servlkf.o servrbf.o servslkf.o \
-uidna.o usprep.o punycode.o \
+uidna.o usprep.o uts46.o punycode.o \
util.o util_props.o parsepos.o locbased.o cwchar.o wintz.o mutex.o dtintrv.o ucnvsel.o propsvec.o \
ulist.o uloc_tag.o icudataver.o icuplug.o
diff --git a/icu4c/source/common/common.vcproj b/icu4c/source/common/common.vcproj
index 98fd5d88c6..81d016f48a 100644
--- a/icu4c/source/common/common.vcproj
+++ b/icu4c/source/common/common.vcproj
@@ -2691,6 +2691,46 @@
Name="idna"
Filter="*.c,*.h"
>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -2743,6 +2783,10 @@
/>
+
+
+ * With nontransitional processing, such characters are
+ * copied to the destination string.
+ * With transitional processing, such characters are
+ * mapped (sharp s/sigma) or removed (joiner/nonjoiner).
+ *
+ * @return TRUE if transitional and nontransitional processing produce different results
+ * @draft ICU 4.6
+ */
+ UBool isTransitionalDifferent() const { return isTransDiff; }
+
+private:
+ friend class UTS46;
+
+ IDNAInfo(const IDNAInfo &other); // no copying
+ IDNAInfo &operator=(const IDNAInfo &other); // no copying
+
+ void reset() {
+ errors=labelErrors=0;
+ isTransDiff=FALSE;
+ isBiDi=FALSE;
+ isOkBiDi=TRUE;
+ }
+
+ uint32_t errors, labelErrors;
+ UBool isTransDiff;
+ UBool isBiDi;
+ UBool isOkBiDi;
+};
+
+U_NAMESPACE_END
+
+#endif // UCONFIG_NO_IDNA
+#endif // __IDNA_H__
diff --git a/icu4c/source/common/unicode/uidna.h b/icu4c/source/common/unicode/uidna.h
index 01c8c85d95..8be336d830 100644
--- a/icu4c/source/common/unicode/uidna.h
+++ b/icu4c/source/common/unicode/uidna.h
@@ -21,64 +21,488 @@
#if !UCONFIG_NO_IDNA
+#include "unicode/localpointer.h"
#include "unicode/parseerr.h"
-
+
/**
* \file
- * \brief C API: Internationalized Domain Names in Applications Tranformation
+ * \brief C API: Internationalizing Domain Names in Applications (IDNA)
*
- * UIDNA API implements the IDNA protocol as defined in the IDNA RFC
+ * IDNA2008 is implemented according to UTS #46, see the IDNA C++ class in idna.h.
+ *
+ * The C API functions which do take a UIDNA * service object pointer
+ * implement UTS #46 and IDNA2008.
+ * The C API functions which do not take a service object pointer
+ * implement IDNA2003.
+ */
+
+/*
+ * IDNA option bit set values.
+ */
+enum {
+ /**
+ * Default options value: None of the other options are set.
+ * @stable ICU 2.6
+ */
+ UIDNA_DEFAULT=0,
+ /**
+ * Option to allow unassigned code points in domain names and labels.
+ * This option is ignored by the UTS46 implementation.
+ * (UTS #46 disallows unassigned code points.)
+ * @stable ICU 2.6
+ */
+ UIDNA_ALLOW_UNASSIGNED=1,
+ /**
+ * Option to check whether the input conforms to the STD3 ASCII rules,
+ * for example the restriction of labels to LDH characters
+ * (ASCII Letters, Digits and Hyphen-Minus).
+ * @stable ICU 2.6
+ */
+ UIDNA_USE_STD3_RULES=2,
+ /**
+ * IDNA option to check whether the input conforms to the BiDi rules.
+ * This option is ignored by the IDNA2003 implementation.
+ * (IDNA2003 always performs a BiDi check.)
+ * @draft ICU 4.6
+ */
+ UIDNA_CHECK_BIDI=4,
+ /**
+ * IDNA option to check whether the input conforms to the CONTEXTJ rules.
+ * This option is ignored by the IDNA2003 implementation.
+ * (The CONTEXTJ check is new in IDNA2008.)
+ * @draft ICU 4.6
+ */
+ UIDNA_CHECK_CONTEXTJ=8,
+ /**
+ * IDNA option for nontransitional processing in ToASCII().
+ * By default, ToASCII() uses transitional processing.
+ * This option is ignored by the IDNA2003 implementation.
+ * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.)
+ * @draft ICU 4.6
+ */
+ UIDNA_NONTRANSITIONAL_TO_ASCII=0x10,
+ /**
+ * IDNA option for nontransitional processing in ToUnicode().
+ * By default, ToUnicode() uses transitional processing.
+ * This option is ignored by the IDNA2003 implementation.
+ * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.)
+ * @draft ICU 4.6
+ */
+ UIDNA_NONTRANSITIONAL_TO_UNICODE=0x20
+};
+
+/**
+ * Opaque C service object type for the new IDNA API.
+ * @draft ICU 4.6
+ */
+struct UIDNA;
+typedef struct UIDNA UIDNA; /**< C typedef for struct UIDNA. @draft ICU 4.6 */
+
+/**
+ * Returns a UIDNA instance which implements UTS #46.
+ * Returns an unmodifiable instance, owned by the caller.
+ * Cache it for multiple operations, and uidna_close() it when done.
+ *
+ * For details about the UTS #46 implementation see the IDNA C++ class in idna.h.
+ *
+ * @param options Bit set to modify the processing and error checking.
+ * See option bit set values in uidna.h.
+ * @param pErrorCode Standard ICU error code. Its input value must
+ * pass the U_SUCCESS() test, or else the function returns
+ * immediately. Check for U_FAILURE() on output or use with
+ * function chaining. (See User Guide for details.)
+ * @return the UTS #46 UIDNA instance, if successful
+ * @draft ICU 4.6
+ */
+U_DRAFT UIDNA * U_EXPORT2
+uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode);
+
+/**
+ * Closes a UIDNA instance.
+ * @param idna UIDNA instance to be closed
+ * @draft ICU 4.6
+ */
+U_DRAFT void U_EXPORT2
+uidna_close(UIDNA *idna);
+
+#if U_SHOW_CPLUSPLUS_API
+
+U_NAMESPACE_BEGIN
+
+/**
+ * \class LocalUIDNAPointer
+ * "Smart pointer" class, closes a UIDNA via uidna_close().
+ * For most methods see the LocalPointerBase base class.
+ *
+ * @see LocalPointerBase
+ * @see LocalPointer
+ * @draft ICU 4.6
+ */
+U_DEFINE_LOCAL_OPEN_POINTER(LocalUIDNAPointer, UIDNA, uidna_close);
+
+U_NAMESPACE_END
+
+#endif
+
+/**
+ * Output container for IDNA processing errors.
+ * Initialize with UIDNA_INFO_INITIALIZER:
+ * \code
+ * UIDNAInfo info = UIDNA_INFO_INITIALIZER;
+ * int32_t length = uidna_nameToASCII(..., &info, &errorCode);
+ * if(U_SUCCESS(errorCode) && info.errors!=0) { ... }
+ * \endcode
+ * @draft ICU 4.6
+ */
+struct UIDNAInfo {
+ /** sizeof(UIDNAInfo) @draft ICU 4.6 */
+ int16_t size;
+ /**
+ * Set to TRUE if transitional and nontransitional processing produce different results.
+ * For details see C++ IDNAInfo::isTransitionalDifferent().
+ * @draft ICU 4.6
+ */
+ UBool isTransitionalDifferent;
+ UBool reservedB3; /**< Reserved field, do not use. @internal */
+ /**
+ * Bit set indicating IDNA processing errors. 0 if no errors.
+ * See UIDNA_ERROR_... constants.
+ * @draft ICU 4.6
+ */
+ uint32_t errors;
+ int32_t reservedI2; /**< Reserved field, do not use. @internal */
+ int32_t reservedI3; /**< Reserved field, do not use. @internal */
+};
+typedef struct UIDNAInfo UIDNAInfo;
+
+/**
+ * Static initializer for a UIDNAInfo struct.
+ * @draft ICU 4.6
+ */
+#define UIDNA_INFO_INITIALIZER { \
+ (int16_t)sizeof(UIDNAInfo), \
+ FALSE, FALSE, \
+ 0, 0, 0 }
+
+/**
+ * Converts a single domain name label into its ASCII form for DNS lookup.
+ * If any processing step fails, then pInfo->errors will be non-zero and
+ * the result might not be an ASCII string.
+ * The label might be modified according to the types of errors.
+ * Labels with severe errors will be left in (or turned into) their Unicode form.
+ *
+ * The UErrorCode indicates an error only in exceptional cases,
+ * such as a U_MEMORY_ALLOCATION_ERROR.
+ *
+ * @param idna UIDNA instance
+ * @param label Input domain name label
+ * @param length Label length, or -1 if NUL-terminated
+ * @param dest Destination string buffer
+ * @param capacity Destination buffer capacity
+ * @param pInfo Output container of IDNA processing details.
+ * @param pErrorCode Standard ICU error code. Its input value must
+ * pass the U_SUCCESS() test, or else the function returns
+ * immediately. Check for U_FAILURE() on output or use with
+ * function chaining. (See User Guide for details.)
+ * @return destination string length
+ * @draft ICU 4.6
+ */
+U_DRAFT int32_t U_EXPORT2
+uidna_labelToASCII(const UIDNA *idna,
+ const UChar *label, int32_t length,
+ UChar *dest, int32_t capacity,
+ UIDNAInfo *pInfo, UErrorCode *pErrorCode);
+
+/**
+ * Converts a single domain name label into its Unicode form for human-readable display.
+ * If any processing step fails, then pInfo->errors will be non-zero.
+ * The label might be modified according to the types of errors.
+ *
+ * The UErrorCode indicates an error only in exceptional cases,
+ * such as a U_MEMORY_ALLOCATION_ERROR.
+ *
+ * @param idna UIDNA instance
+ * @param label Input domain name label
+ * @param length Label length, or -1 if NUL-terminated
+ * @param dest Destination string buffer
+ * @param capacity Destination buffer capacity
+ * @param pInfo Output container of IDNA processing details.
+ * @param pErrorCode Standard ICU error code. Its input value must
+ * pass the U_SUCCESS() test, or else the function returns
+ * immediately. Check for U_FAILURE() on output or use with
+ * function chaining. (See User Guide for details.)
+ * @return destination string length
+ * @draft ICU 4.6
+ */
+U_DRAFT int32_t U_EXPORT2
+uidna_labelToUnicode(const UIDNA *idna,
+ const UChar *label, int32_t length,
+ UChar *dest, int32_t capacity,
+ UIDNAInfo *pInfo, UErrorCode *pErrorCode);
+
+/**
+ * Converts a whole domain name into its ASCII form for DNS lookup.
+ * If any processing step fails, then pInfo->errors will be non-zero and
+ * the result might not be an ASCII string.
+ * The domain name might be modified according to the types of errors.
+ * Labels with severe errors will be left in (or turned into) their Unicode form.
+ *
+ * The UErrorCode indicates an error only in exceptional cases,
+ * such as a U_MEMORY_ALLOCATION_ERROR.
+ *
+ * @param idna UIDNA instance
+ * @param name Input domain name
+ * @param length Domain name length, or -1 if NUL-terminated
+ * @param dest Destination string buffer
+ * @param capacity Destination buffer capacity
+ * @param pInfo Output container of IDNA processing details.
+ * @param pErrorCode Standard ICU error code. Its input value must
+ * pass the U_SUCCESS() test, or else the function returns
+ * immediately. Check for U_FAILURE() on output or use with
+ * function chaining. (See User Guide for details.)
+ * @return destination string length
+ * @draft ICU 4.6
+ */
+U_DRAFT int32_t U_EXPORT2
+uidna_nameToASCII(const UIDNA *idna,
+ const UChar *name, int32_t length,
+ UChar *dest, int32_t capacity,
+ UIDNAInfo *pInfo, UErrorCode *pErrorCode);
+
+/**
+ * Converts a whole domain name into its Unicode form for human-readable display.
+ * If any processing step fails, then pInfo->errors will be non-zero.
+ * The domain name might be modified according to the types of errors.
+ *
+ * The UErrorCode indicates an error only in exceptional cases,
+ * such as a U_MEMORY_ALLOCATION_ERROR.
+ *
+ * @param idna UIDNA instance
+ * @param name Input domain name
+ * @param length Domain name length, or -1 if NUL-terminated
+ * @param dest Destination string buffer
+ * @param capacity Destination buffer capacity
+ * @param pInfo Output container of IDNA processing details.
+ * @param pErrorCode Standard ICU error code. Its input value must
+ * pass the U_SUCCESS() test, or else the function returns
+ * immediately. Check for U_FAILURE() on output or use with
+ * function chaining. (See User Guide for details.)
+ * @return destination string length
+ * @draft ICU 4.6
+ */
+U_DRAFT int32_t U_EXPORT2
+uidna_nameToUnicode(const UIDNA *idna,
+ const UChar *name, int32_t length,
+ UChar *dest, int32_t capacity,
+ UIDNAInfo *pInfo, UErrorCode *pErrorCode);
+
+/* UTF-8 versions of the processing methods --------------------------------- */
+
+/**
+ * Converts a single domain name label into its ASCII form for DNS lookup.
+ * UTF-8 version of uidna_labelToASCII(), same behavior.
+ *
+ * @param idna UIDNA instance
+ * @param label Input domain name label
+ * @param length Label length, or -1 if NUL-terminated
+ * @param dest Destination string buffer
+ * @param capacity Destination buffer capacity
+ * @param pInfo Output container of IDNA processing details.
+ * @param pErrorCode Standard ICU error code. Its input value must
+ * pass the U_SUCCESS() test, or else the function returns
+ * immediately. Check for U_FAILURE() on output or use with
+ * function chaining. (See User Guide for details.)
+ * @return destination string length
+ * @draft ICU 4.6
+ */
+U_DRAFT int32_t U_EXPORT2
+uidna_labelToASCII_UTF8(const UIDNA *idna,
+ const char *label, int32_t length,
+ char *dest, int32_t capacity,
+ UIDNAInfo *pInfo, UErrorCode *pErrorCode);
+
+/**
+ * Converts a single domain name label into its Unicode form for human-readable display.
+ * UTF-8 version of uidna_labelToUnicode(), same behavior.
+ *
+ * @param idna UIDNA instance
+ * @param label Input domain name label
+ * @param length Label length, or -1 if NUL-terminated
+ * @param dest Destination string buffer
+ * @param capacity Destination buffer capacity
+ * @param pInfo Output container of IDNA processing details.
+ * @param pErrorCode Standard ICU error code. Its input value must
+ * pass the U_SUCCESS() test, or else the function returns
+ * immediately. Check for U_FAILURE() on output or use with
+ * function chaining. (See User Guide for details.)
+ * @return destination string length
+ * @draft ICU 4.6
+ */
+U_DRAFT int32_t U_EXPORT2
+uidna_labelToUnicodeUTF8(const UIDNA *idna,
+ const char *label, int32_t length,
+ char *dest, int32_t capacity,
+ UIDNAInfo *pInfo, UErrorCode *pErrorCode);
+
+/**
+ * Converts a whole domain name into its ASCII form for DNS lookup.
+ * UTF-8 version of uidna_nameToASCII(), same behavior.
+ *
+ * @param idna UIDNA instance
+ * @param name Input domain name
+ * @param length Domain name length, or -1 if NUL-terminated
+ * @param dest Destination string buffer
+ * @param capacity Destination buffer capacity
+ * @param pInfo Output container of IDNA processing details.
+ * @param pErrorCode Standard ICU error code. Its input value must
+ * pass the U_SUCCESS() test, or else the function returns
+ * immediately. Check for U_FAILURE() on output or use with
+ * function chaining. (See User Guide for details.)
+ * @return destination string length
+ * @draft ICU 4.6
+ */
+U_DRAFT int32_t U_EXPORT2
+uidna_nameToASCII_UTF8(const UIDNA *idna,
+ const char *name, int32_t length,
+ char *dest, int32_t capacity,
+ UIDNAInfo *pInfo, UErrorCode *pErrorCode);
+
+/**
+ * Converts a whole domain name into its Unicode form for human-readable display.
+ * UTF-8 version of uidna_nameToUnicode(), same behavior.
+ *
+ * @param idna UIDNA instance
+ * @param name Input domain name
+ * @param length Domain name length, or -1 if NUL-terminated
+ * @param dest Destination string buffer
+ * @param capacity Destination buffer capacity
+ * @param pInfo Output container of IDNA processing details.
+ * @param pErrorCode Standard ICU error code. Its input value must
+ * pass the U_SUCCESS() test, or else the function returns
+ * immediately. Check for U_FAILURE() on output or use with
+ * function chaining. (See User Guide for details.)
+ * @return destination string length
+ * @draft ICU 4.6
+ */
+U_DRAFT int32_t U_EXPORT2
+uidna_nameToUnicodeUTF8(const UIDNA *idna,
+ const char *name, int32_t length,
+ char *dest, int32_t capacity,
+ UIDNAInfo *pInfo, UErrorCode *pErrorCode);
+
+/*
+ * IDNA error bit set values.
+ * When a domain name or label fails a processing step or does not meet the
+ * validity criteria, then one or more of these error bits are set.
+ */
+enum {
+ /**
+ * A non-final domain name label (or the whole domain name) is empty.
+ * @draft ICU 4.6
+ */
+ UIDNA_ERROR_EMPTY_LABEL=1,
+ /**
+ * A domain name label is longer than 63 bytes.
+ * (See STD13/RFC1034 3.1. Name space specifications and terminology.)
+ * This is only checked in ToASCII operations, and only if the UIDNA_USE_STD3_RULES option is set.
+ * @draft ICU 4.6
+ */
+ UIDNA_ERROR_LABEL_TOO_LONG=2,
+ /**
+ * A domain name is longer than 255 bytes in its storage form.
+ * (See STD13/RFC1034 3.1. Name space specifications and terminology.)
+ * This is only checked in ToASCII operations, and only if the UIDNA_USE_STD3_RULES option is set.
+ * @draft ICU 4.6
+ */
+ UIDNA_ERROR_DOMAIN_NAME_TOO_LONG=4,
+ /**
+ * A label starts with a hyphen-minus ('-').
+ * @draft ICU 4.6
+ */
+ UIDNA_ERROR_LEADING_HYPHEN=8,
+ /**
+ * A label ends with a hyphen-minus ('-').
+ * @draft ICU 4.6
+ */
+ UIDNA_ERROR_TRAILING_HYPHEN=0x10,
+ /**
+ * A label contains hyphen-minus ('-') in the third and fourth positions.
+ * @draft ICU 4.6
+ */
+ UIDNA_ERROR_HYPHEN_3_4=0x20,
+ /**
+ * A label starts with a combining mark.
+ * @draft ICU 4.6
+ */
+ UIDNA_ERROR_LEADING_COMBINING_MARK=0x40,
+ /**
+ * A label or domain name contains disallowed characters.
+ * @draft ICU 4.6
+ */
+ UIDNA_ERROR_DISALLOWED=0x80,
+ /**
+ * A label starts with "xn--" but does not contain valid Punycode.
+ * That is, an xn-- label failed Punycode decoding.
+ * @draft ICU 4.6
+ */
+ UIDNA_ERROR_PUNYCODE=0x100,
+ /**
+ * A label contains a dot (full stop).
+ * This can occur in an input string for a single-label function.
+ * @draft ICU 4.6
+ */
+ UIDNA_ERROR_LABEL_HAS_DOT=0x200,
+ /**
+ * An ACE label does not contain a valid label string.
+ * The label was successfully ACE (Punycode) decoded but the resulting
+ * string had severe validation errors. For example,
+ * it might contain characters that are not allowed in ACE labels,
+ * or it might not be normalized.
+ * @draft ICU 4.6
+ */
+ UIDNA_ERROR_INVALID_ACE_LABEL=0x400,
+ /**
+ * A label does not meet the IDNA BiDi requirements (for right-to-left characters).
+ * @draft ICU 4.6
+ */
+ UIDNA_ERROR_BIDI=0x800,
+ /**
+ * A label does not meet the IDNA CONTEXTJ requirements.
+ * @draft ICU 4.6
+ */
+ UIDNA_ERROR_CONTEXTJ=0x1000
+};
+
+/* IDNA2003 API ------------------------------------------------------------- */
+
+/**
+ * IDNA2003: This function implements the ToASCII operation as defined in the IDNA RFC.
+ * This operation is done on single labels before sending it to something that expects
+ * ASCII names. A label is an individual part of a domain name. Labels are usually
+ * separated by dots; e.g. "www.example.com" is composed of 3 labels "www","example", and "com".
+ *
+ * IDNA2003 API Overview:
+ *
+ * The uidna_ API implements the IDNA protocol as defined in the IDNA RFC
* (http://www.ietf.org/rfc/rfc3490.txt).
- * The RFC defines 2 operations: ToASCII and ToUnicode. Domain labels
- * containing non-ASCII code points are required to be processed by
+ * The RFC defines 2 operations: ToASCII and ToUnicode. Domain name labels
+ * containing non-ASCII code points are processed by the
* ToASCII operation before passing it to resolver libraries. Domain names
- * that are obtained from resolver libraries are required to be processed by
+ * that are obtained from resolver libraries are processed by the
* ToUnicode operation before displaying the domain name to the user.
* IDNA requires that implementations process input strings with Nameprep
- * (http://www.ietf.org/rfc/rfc3491.txt),
- * which is a profile of Stringprep (http://www.ietf.org/rfc/rfc3454.txt),
- * and then with Punycode (http://www.ietf.org/rfc/rfc3492.txt).
- * Implementations of IDNA MUST fully implement Nameprep and Punycode;
+ * (http://www.ietf.org/rfc/rfc3491.txt),
+ * which is a profile of Stringprep (http://www.ietf.org/rfc/rfc3454.txt),
+ * and then with Punycode (http://www.ietf.org/rfc/rfc3492.txt).
+ * Implementations of IDNA MUST fully implement Nameprep and Punycode;
* neither Nameprep nor Punycode are optional.
- * The input and output of ToASCII and ToUnicode operations are Unicode
+ * The input and output of ToASCII and ToUnicode operations are Unicode
* and are designed to be chainable, i.e., applying ToASCII or ToUnicode operations
* multiple times to an input string will yield the same result as applying the operation
* once.
* ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string)
* ToASCII(ToASCII(ToASCII...(ToASCII(string))) == ToASCII(string).
*
- */
-
-/**
- * Option to prohibit processing of unassigned codepoints in the input and
- * do not check if the input conforms to STD-3 ASCII rules.
- *
- * @see uidna_toASCII uidna_toUnicode
- * @stable ICU 2.6
- */
-#define UIDNA_DEFAULT 0x0000
-/**
- * Option to allow processing of unassigned codepoints in the input
- *
- * @see uidna_toASCII uidna_toUnicode
- * @stable ICU 2.6
- */
-#define UIDNA_ALLOW_UNASSIGNED 0x0001
-/**
- * Option to check if input conforms to STD-3 ASCII rules
- *
- * @see uidna_toASCII uidna_toUnicode
- * @stable ICU 2.6
- */
-#define UIDNA_USE_STD3_RULES 0x0002
-
-/**
- * This function implements the ToASCII operation as defined in the IDNA RFC.
- * This operation is done on single labels before sending it to something that expects
- * ASCII names. A label is an individual part of a domain name. Labels are usually
- * separated by dots; e.g. "www.example.com" is composed of 3 labels "www","example", and "com".
- *
- *
* @param src Input UChar array containing label in Unicode.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output UChar array with ASCII (ACE encoded) label.
@@ -93,7 +517,7 @@
* - UIDNA_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
* If this option is set, the unassigned code points are in the input
* are treated as normal Unicode code points.
- *
+ *
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
@@ -119,7 +543,7 @@ uidna_toASCII(const UChar* src, int32_t srcLength,
/**
- * This function implements the ToUnicode operation as defined in the IDNA RFC.
+ * IDNA2003: This function implements the ToUnicode operation as defined in the IDNA RFC.
* This operation is done on single labels before sending it to something that expects
* Unicode names. A label is an individual part of a domain name. Labels are usually
* separated by dots; for e.g. "www.example.com" is composed of 3 labels "www","example", and "com".
@@ -129,7 +553,7 @@ uidna_toASCII(const UChar* src, int32_t srcLength,
* @param dest Output Converted UChar array containing Unicode equivalent of label.
* @param destCapacity Size of dest.
* @param options A bit set of options:
- *
+ *
* - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points
* and do not use STD3 ASCII rules
* If unassigned code points are found the operation fails with
@@ -142,8 +566,6 @@ uidna_toASCII(const UChar* src, int32_t srcLength,
* verification of decoded ACE input by applying toASCII and comparing
* its output with source
*
- *
- *
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
@@ -169,12 +591,12 @@ uidna_toUnicode(const UChar* src, int32_t srcLength,
/**
- * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
+ * IDNA2003: Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
* This operation is done on complete domain names, e.g: "www.example.com".
* It is important to note that this operation can fail. If it fails, then the input
* domain name cannot be used as an Internationalized Domain Name and the application
* should have methods defined to deal with the failure.
- *
+ *
* Note: IDNA RFC specifies that a conformant application should divide a domain name
* into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
* and then convert. This function does not offer that level of granularity. The options once
@@ -185,7 +607,7 @@ uidna_toUnicode(const UChar* src, int32_t srcLength,
* @param dest Output UChar array with ASCII (ACE encoded) IDN.
* @param destCapacity Size of dest.
* @param options A bit set of options:
- *
+ *
* - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points
* and do not use STD3 ASCII rules
* If unassigned code points are found the operation fails with
@@ -194,11 +616,11 @@ uidna_toUnicode(const UChar* src, int32_t srcLength,
* - UIDNA_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
* If this option is set, the unassigned code points are in the input
* are treated as normal Unicode code points.
- *
+ *
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
- *
+ *
* @param parseError Pointer to UParseError struct to receive information on position
* of error if an error is encountered. Can be NULL.
* @param status ICU in/out error code parameter.
@@ -219,7 +641,7 @@ uidna_IDNToASCII( const UChar* src, int32_t srcLength,
UErrorCode* status);
/**
- * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
+ * IDNA2003: Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
* This operation is done on complete domain names, e.g: "www.example.com".
*
* Note: IDNA RFC specifies that a conformant application should divide a domain name
@@ -232,7 +654,7 @@ uidna_IDNToASCII( const UChar* src, int32_t srcLength,
* @param dest Output UChar array containing Unicode equivalent of source IDN.
* @param destCapacity Size of dest.
* @param options A bit set of options:
- *
+ *
* - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points
* and do not use STD3 ASCII rules
* If unassigned code points are found the operation fails with
@@ -241,7 +663,7 @@ uidna_IDNToASCII( const UChar* src, int32_t srcLength,
* - UIDNA_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
* If this option is set, the unassigned code points are in the input
* are treated as normal Unicode code points.
- *
+ *
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
@@ -266,7 +688,7 @@ uidna_IDNToUnicode( const UChar* src, int32_t srcLength,
UErrorCode* status);
/**
- * Compare two IDN strings for equivalence.
+ * IDNA2003: Compare two IDN strings for equivalence.
* This function splits the domain names into labels and compares them.
* According to IDN RFC, whenever two labels are compared, they are
* considered equal if and only if their ASCII forms (obtained by
@@ -280,7 +702,7 @@ uidna_IDNToUnicode( const UChar* src, int32_t srcLength,
* @param s2 Second source string.
* @param length2 Length of second source string, or -1 if NUL-terminated.
* @param options A bit set of options:
- *
+ *
* - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points
* and do not use STD3 ASCII rules
* If unassigned code points are found the operation fails with
@@ -289,7 +711,7 @@ uidna_IDNToUnicode( const UChar* src, int32_t srcLength,
* - UIDNA_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
* If this option is set, the unassigned code points are in the input
* are treated as normal Unicode code points.
- *
+ *
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
diff --git a/icu4c/source/common/unicode/unistr.h b/icu4c/source/common/unicode/unistr.h
index aba4c45a92..d8a399ff0f 100644
--- a/icu4c/source/common/unicode/unistr.h
+++ b/icu4c/source/common/unicode/unistr.h
@@ -1600,6 +1600,7 @@ public:
* Calls u_strToUTF8WithSub().
*
* @param sink A ByteSink to which the UTF-8 version of the string is written.
+ * sink.Flush() is called at the end.
* @stable ICU 4.2
* @see toUTF8String
*/
diff --git a/icu4c/source/common/unicode/unorm2.h b/icu4c/source/common/unicode/unorm2.h
index 25af203b19..8c6c5f91d9 100644
--- a/icu4c/source/common/unicode/unorm2.h
+++ b/icu4c/source/common/unicode/unorm2.h
@@ -89,17 +89,17 @@ typedef enum {
* @stable ICU 2.0
*/
typedef enum UNormalizationCheckResult {
- /**
+ /**
* The input string is not in the normalization form.
* @stable ICU 2.0
*/
UNORM_NO,
- /**
+ /**
* The input string is in the normalization form.
* @stable ICU 2.0
*/
UNORM_YES,
- /**
+ /**
* The input string may or may not be in the normalization form.
* This value is only returned for composition forms like NFC and FCC,
* when a backward-combining character is found for which the surrounding text
@@ -151,7 +151,7 @@ unorm2_getInstance(const char *packageName,
* Both are aliased and must not be modified or deleted while this object
* is used.
* The filter set should be frozen; otherwise the performance will suffer greatly.
- * @param norm2 wrapped Normalizer2 instance
+ * @param norm2 wrapped UNormalizer2 instance
* @param filterSet USet which determines the characters to be normalized
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
diff --git a/icu4c/source/common/unistr.cpp b/icu4c/source/common/unistr.cpp
index d5b2e87f55..ace989bf9a 100644
--- a/icu4c/source/common/unistr.cpp
+++ b/icu4c/source/common/unistr.cpp
@@ -867,6 +867,7 @@ UnicodeString::toUTF8(ByteSink &sink) const {
}
if(U_SUCCESS(errorCode)) {
sink.Append(utf8, length8);
+ sink.Flush();
}
if(utf8IsOwned) {
uprv_free(utf8);
diff --git a/icu4c/source/common/uts46.cpp b/icu4c/source/common/uts46.cpp
new file mode 100644
index 0000000000..bbfc5e6c94
--- /dev/null
+++ b/icu4c/source/common/uts46.cpp
@@ -0,0 +1,1333 @@
+/*
+*******************************************************************************
+* Copyright (C) 2010, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* file name: uts46.cpp
+* encoding: US-ASCII
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2010mar09
+* created by: Markus W. Scherer
+*/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_IDNA
+
+#include "unicode/idna.h"
+#include "unicode/normalizer2.h"
+#include "unicode/ustring.h"
+#include "cmemory.h"
+#include "cstring.h"
+#include "punycode.h"
+#include "ustr_imp.h"
+
+#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
+
+// Note about tests for UIDNA_ERROR_DOMAIN_NAME_TOO_LONG:
+//
+// The domain name length limit is 255 octets in an internal DNS representation
+// where the last ("root") label is the empty label
+// represented by length byte 0 alone.
+// In a conventional string, this translates to 253 characters, or 254
+// if there is a trailing dot for the root label.
+
+U_NAMESPACE_BEGIN
+
+// Severe errors which usually result in a U+FFFD replacement character in the result string.
+const uint32_t severeErrors=  // Bit set of UIDNA_ERROR_* flags; tested as (errors&severeErrors).
+ UIDNA_ERROR_LEADING_COMBINING_MARK|
+ UIDNA_ERROR_DISALLOWED|
+ UIDNA_ERROR_PUNYCODE|
+ UIDNA_ERROR_LABEL_HAS_DOT|
+ UIDNA_ERROR_INVALID_ACE_LABEL;
+
+// Returns TRUE if every UTF-16 code unit of dest is ASCII (<=0x7f),
+// which is trivially the case for an empty string.
+static inline UBool
+isASCIIString(const UnicodeString &dest) {
+    const UChar *s=dest.getBuffer();
+    const UChar *limit=s+dest.length();
+    // Scan until the first non-ASCII code unit or the end of the string.
+    while(s<limit) {
+        if(*s++>0x7f) {
+            return FALSE;
+        }
+    }
+    return TRUE;
+}
+
+static UBool  // UTF-16 overload (forward declaration); called from UTS46::process().
+isASCIIOkBiDi(const UChar *s, int32_t length);
+
+static UBool  // UTF-8/char overload (forward declaration); called from UTS46::processUTF8().
+isASCIIOkBiDi(const char *s, int32_t length);
+
+// IDNA class default implementations -------------------------------------- ***
+
+// Default UTF-8 flavor of labelToASCII(): decodes label from UTF-8,
+// calls the UTF-16 labelToASCII(), and writes its result to dest as UTF-8.
+// Does nothing if errorCode already indicates a failure.
+void
+IDNA::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest,
+                        IDNAInfo &info, UErrorCode &errorCode) const {
+    if(U_FAILURE(errorCode)) {
+        return;
+    }
+    UnicodeString utf16Result;
+    const UnicodeString utf16Label=UnicodeString::fromUTF8(label);
+    labelToASCII(utf16Label, utf16Result, info, errorCode).toUTF8(dest);
+}
+
+// Default UTF-8 flavor of labelToUnicode(): decodes label from UTF-8,
+// calls the UTF-16 labelToUnicode(), and writes its result to dest as UTF-8.
+// Does nothing if errorCode already indicates a failure.
+void
+IDNA::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest,
+                         IDNAInfo &info, UErrorCode &errorCode) const {
+    if(U_FAILURE(errorCode)) {
+        return;
+    }
+    UnicodeString utf16Result;
+    const UnicodeString utf16Label=UnicodeString::fromUTF8(label);
+    labelToUnicode(utf16Label, utf16Result, info, errorCode).toUTF8(dest);
+}
+
+// Default UTF-8 flavor of nameToASCII(): decodes name from UTF-8,
+// calls the UTF-16 nameToASCII(), and writes its result to dest as UTF-8.
+// Does nothing if errorCode already indicates a failure.
+void
+IDNA::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest,
+                       IDNAInfo &info, UErrorCode &errorCode) const {
+    if(U_FAILURE(errorCode)) {
+        return;
+    }
+    UnicodeString utf16Result;
+    const UnicodeString utf16Name=UnicodeString::fromUTF8(name);
+    nameToASCII(utf16Name, utf16Result, info, errorCode).toUTF8(dest);
+}
+
+// Default UTF-8 flavor of nameToUnicode(): decodes name from UTF-8,
+// calls the UTF-16 nameToUnicode(), and writes its result to dest as UTF-8.
+// Does nothing if errorCode already indicates a failure.
+void
+IDNA::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest,
+                        IDNAInfo &info, UErrorCode &errorCode) const {
+    if(U_FAILURE(errorCode)) {
+        return;
+    }
+    UnicodeString utf16Result;
+    const UnicodeString utf16Name=UnicodeString::fromUTF8(name);
+    nameToUnicode(utf16Name, utf16Result, info, errorCode).toUTF8(dest);
+}
+
+UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(IDNA)
+
+// UTS46 class declaration ------------------------------------------------- ***
+
+class UTS46 : public IDNA {  // IDNA implementation that normalizes with the "uts46" Normalizer2 instance.
+public:
+ UTS46(uint32_t options, UErrorCode &errorCode);  // options: UIDNA_* option bits, stored as-is.
+ virtual ~UTS46();
+
+ virtual UnicodeString &  // UTF-16 API: the four functions below call process().
+ labelToASCII(const UnicodeString &label, UnicodeString &dest,
+ IDNAInfo &info, UErrorCode &errorCode) const;
+
+ virtual UnicodeString &
+ labelToUnicode(const UnicodeString &label, UnicodeString &dest,
+ IDNAInfo &info, UErrorCode &errorCode) const;
+
+ virtual UnicodeString &  // Additionally checks the overall name length after process().
+ nameToASCII(const UnicodeString &name, UnicodeString &dest,
+ IDNAInfo &info, UErrorCode &errorCode) const;
+
+ virtual UnicodeString &
+ nameToUnicode(const UnicodeString &name, UnicodeString &dest,
+ IDNAInfo &info, UErrorCode &errorCode) const;
+
+ virtual void  // UTF-8 API: the four functions below call processUTF8().
+ labelToASCII_UTF8(const StringPiece &label, ByteSink &dest,
+ IDNAInfo &info, UErrorCode &errorCode) const;
+
+ virtual void
+ labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest,
+ IDNAInfo &info, UErrorCode &errorCode) const;
+
+ virtual void
+ nameToASCII_UTF8(const StringPiece &name, ByteSink &dest,
+ IDNAInfo &info, UErrorCode &errorCode) const;
+
+ virtual void
+ nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest,
+ IDNAInfo &info, UErrorCode &errorCode) const;
+
+private:
+ UnicodeString &  // Shared UTF-16 implementation: ASCII fastpath, then processUnicode().
+ process(const UnicodeString &src,
+ UBool isLabel, UBool toASCII,
+ UnicodeString &dest,
+ IDNAInfo &info, UErrorCode &errorCode) const;
+
+ void  // Shared UTF-8 implementation; the ASCII fastpath is used only for srcLength<=256.
+ processUTF8(const StringPiece &src,
+ UBool isLabel, UBool toASCII,
+ ByteSink &dest,
+ IDNAInfo &info, UErrorCode &errorCode) const;
+
+ UnicodeString &  // Normalizes src from mappingStart into dest, then handles labels from labelStart.
+ processUnicode(const UnicodeString &src,
+ int32_t labelStart, int32_t mappingStart,
+ UBool isLabel, UBool toASCII,
+ UnicodeString &dest,
+ IDNAInfo &info, UErrorCode &errorCode) const;
+
+ // returns the new dest.length()
+ int32_t
+ mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
+ UErrorCode &errorCode) const;
+
+ // returns the new label length
+ int32_t
+ processLabel(UnicodeString &dest,
+ int32_t labelStart, int32_t labelLength,
+ UBool toASCII,
+ IDNAInfo &info, UErrorCode &errorCode) const;
+ int32_t  // Makes a bad "xn--" label not look valid; see the comment at its definition.
+ markBadACELabel(UnicodeString &dest,
+ int32_t labelStart, int32_t labelLength,
+ UBool toASCII, IDNAInfo &info) const;
+
+ void  // Updates info.isOkBiDi for one label (IDNA2008 BiDi rule).
+ checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const;
+
+ UBool  // CONTEXTJ check for U+200C..U+200D in one label.
+ isLabelOkContextJ(const UChar *label, int32_t labelLength) const;
+
+ const Normalizer2 &uts46Norm2; // uts46.nrm
+ uint32_t options;  // UIDNA_* option bits passed to the constructor.
+};
+
+// Factory for the UTS #46 implementation.
+// Returns a new UTS46 object for the given options, or NULL if errorCode
+// indicates a failure on input or after construction
+// (U_MEMORY_ALLOCATION_ERROR is set if the allocation itself fails).
+IDNA *
+IDNA::createUTS46Instance(uint32_t options, UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) {
+        return NULL;
+    }
+    IDNA *instance=new UTS46(options, errorCode);
+    if(instance==NULL) {
+        errorCode=U_MEMORY_ALLOCATION_ERROR;
+        return NULL;
+    }
+    if(U_FAILURE(errorCode)) {
+        // The constructor reported an error: discard the unusable object.
+        delete instance;
+        return NULL;
+    }
+    return instance;
+}
+
+// UTS46 implementation ---------------------------------------------------- ***
+
+UTS46::UTS46(uint32_t opt, UErrorCode &errorCode)  // Binds the "uts46" compose normalizer and stores the option bits.
+ : uts46Norm2(*Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, errorCode)),  // NOTE(review): the getInstance() result is dereferenced unconditionally; presumably it can be NULL when errorCode reports a failure - confirm.
+ options(opt) {}
+
+UTS46::~UTS46() {}  // Nothing to release: uts46Norm2 is only a reference.
+
+// UTF-16 labelToASCII: delegates to process() in single-label, to-ASCII mode.
+UnicodeString &
+UTS46::labelToASCII(const UnicodeString &label, UnicodeString &dest,
+                    IDNAInfo &info, UErrorCode &errorCode) const {
+    const UBool isLabel=TRUE;
+    const UBool toASCII=TRUE;
+    return process(label, isLabel, toASCII, dest, info, errorCode);
+}
+
+// UTF-16 labelToUnicode: delegates to process() in single-label, to-Unicode mode.
+UnicodeString &
+UTS46::labelToUnicode(const UnicodeString &label, UnicodeString &dest,
+                      IDNAInfo &info, UErrorCode &errorCode) const {
+    const UBool isLabel=TRUE;
+    const UBool toASCII=FALSE;
+    return process(label, isLabel, toASCII, dest, info, errorCode);
+}
+
+// UTF-16 nameToASCII: runs process() in whole-name, to-ASCII mode and then
+// flags UIDNA_ERROR_DOMAIN_NAME_TOO_LONG for an all-ASCII result that is
+// longer than 253 characters, not counting one trailing dot at index 253.
+UnicodeString &
+UTS46::nameToASCII(const UnicodeString &name, UnicodeString &dest,
+                   IDNAInfo &info, UErrorCode &errorCode) const {
+    process(name, FALSE, TRUE, dest, info, errorCode);
+    const int32_t resultLength=dest.length();
+    if(resultLength<254 || (info.errors&UIDNA_ERROR_DOMAIN_NAME_TOO_LONG)!=0) {
+        // Short enough, or the error was already recorded.
+        return dest;
+    }
+    if(!isASCIIString(dest)) {
+        return dest;
+    }
+    if(resultLength>254 || dest[253]!=0x2e) {
+        info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
+    }
+    return dest;
+}
+
+// UTF-16 nameToUnicode: delegates to process() in whole-name, to-Unicode mode.
+UnicodeString &
+UTS46::nameToUnicode(const UnicodeString &name, UnicodeString &dest,
+                     IDNAInfo &info, UErrorCode &errorCode) const {
+    const UBool isLabel=FALSE;
+    const UBool toASCII=FALSE;
+    return process(name, isLabel, toASCII, dest, info, errorCode);
+}
+
+// UTF-8 labelToASCII: delegates to processUTF8() in single-label, to-ASCII mode.
+void
+UTS46::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest,
+                         IDNAInfo &info, UErrorCode &errorCode) const {
+    const UBool isLabel=TRUE;
+    const UBool toASCII=TRUE;
+    processUTF8(label, isLabel, toASCII, dest, info, errorCode);
+}
+
+// UTF-8 labelToUnicode: delegates to processUTF8() in single-label, to-Unicode mode.
+void
+UTS46::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest,
+                          IDNAInfo &info, UErrorCode &errorCode) const {
+    const UBool isLabel=TRUE;
+    const UBool toASCII=FALSE;
+    processUTF8(label, isLabel, toASCII, dest, info, errorCode);
+}
+
+// UTF-8 nameToASCII: delegates to processUTF8() in whole-name, to-ASCII mode.
+void
+UTS46::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest,
+                        IDNAInfo &info, UErrorCode &errorCode) const {
+    const UBool isLabel=FALSE;
+    const UBool toASCII=TRUE;
+    processUTF8(name, isLabel, toASCII, dest, info, errorCode);
+}
+
+// UTF-8 nameToUnicode: delegates to processUTF8() in whole-name, to-Unicode mode.
+void
+UTS46::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest,
+                         IDNAInfo &info, UErrorCode &errorCode) const {
+    const UBool isLabel=FALSE;
+    const UBool toASCII=FALSE;
+    processUTF8(name, isLabel, toASCII, dest, info, errorCode);
+}
+
+// UTS #46 data for ASCII characters.
+// The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase
+// and passes through all other ASCII characters.
+// If UIDNA_USE_STD3_RULES is set, then non-LDH characters are disallowed
+// using this data.
+// The ASCII fastpath also uses this data.
+// Values: -1==disallowed 0==valid 1==mapped (lowercase)
+static const int8_t asciiData[128]={  // Indexed by the ASCII code point 0..0x7f.
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0x00..0x0f
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0x10..0x1f
+ // 002D..002E; valid # HYPHEN-MINUS..FULL STOP
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1,  // 0x20..0x2f
+ // 0030..0039; valid # DIGIT ZERO..DIGIT NINE
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,  // 0x30..0x3f
+ // 0041..005A; mapped # LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
+ -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x40..0x4f
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1,  // 0x50..0x5f
+ // 0061..007A; valid # LATIN SMALL LETTER A..LATIN SMALL LETTER Z
+ -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60..0x6f
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1  // 0x70..0x7f
+};
+
+UnicodeString &  // Shared UTF-16 implementation behind the four UTF-16 API functions.
+UTS46::process(const UnicodeString &src,
+ UBool isLabel, UBool toASCII,
+ UnicodeString &dest,
+ IDNAInfo &info, UErrorCode &errorCode) const {
+ // uts46Norm2.normalize() would do all of this error checking and setup,
+ // but with the ASCII fastpath we do not always call it, and do not
+ // call it first.
+ if(U_FAILURE(errorCode)) {
+ dest.setToBogus();
+ return dest;
+ }
+ const UChar *srcArray=src.getBuffer();
+ if(&dest==&src || srcArray==NULL) {  // src and dest must be distinct objects.
+ errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ dest.setToBogus();
+ return dest;
+ }
+ // Arguments are fine, reset output values.
+ dest.remove();
+ info.reset();
+ int32_t srcLength=src.length();
+ if(srcLength==0) {
+ if(toASCII) {  // An empty input is only an error for toASCII.
+ info.errors|=UIDNA_ERROR_EMPTY_LABEL;
+ }
+ return dest;
+ }
+ UChar *destArray=dest.getBuffer(srcLength);
+ if(destArray==NULL) {
+ errorCode=U_MEMORY_ALLOCATION_ERROR;
+ return dest;
+ }
+ // ASCII fastpath
+ UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
+ int32_t labelStart=0;
+ int32_t i;
+ for(i=0;; ++i) {
+ if(i==srcLength) {
+ if(toASCII) {
+ if((i-labelStart)>63) {
+ info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
+ }
+ // There is a trailing dot if labelStart==i.
+ if(!isLabel && i>=254 && (i>254 || labelStart0x7f) {  // NOTE(review): this line is truncated (text between a '<' and a later '>' is missing, including the declaration of c); restore it from the upstream file.
+ break;
+ }
+ int cData=asciiData[c];
+ if(cData>0) {
+ destArray[i]=c+0x20; // Lowercase an uppercase ASCII letter.
+ } else if(cData<0 && disallowNonLDHDot) {
+ break; // Replacing with U+FFFD can be complicated for toASCII.
+ } else {
+ destArray[i]=c;
+ if(c==0x2d) { // hyphen
+ if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
+ // "??--..." is Punycode or forbidden.
+ break;
+ }
+ if(i==labelStart) {
+ // label starts with "-"
+ info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
+ }
+ if((i+1)==srcLength || srcArray[i+1]==0x2e) {
+ // label ends with "-"
+ info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
+ }
+ } else if(c==0x2e) { // dot
+ if(isLabel) {
+ break; // Replacing with U+FFFD can be complicated for toASCII.
+ }
+ if(toASCII) {
+ // Permit an empty label at the end but not elsewhere.
+ if(i==labelStart && i<(srcLength-1)) {
+ info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
+ } else if((i-labelStart)>63) {
+ info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
+ }
+ }
+ info.errors|=info.labelErrors;  // Fold this label's errors into the overall result.
+ info.labelErrors=0;
+ labelStart=i+1;
+ }
+ }
+ }
+ info.errors|=info.labelErrors;
+ dest.releaseBuffer(i);  // Keep the i code units written by the fastpath.
+ processUnicode(src, labelStart, i, isLabel, toASCII, dest, info, errorCode);  // Continue from the current label / position i.
+ if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
+ (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(dest.getBuffer(), labelStart)))  // Also check the labels the fastpath handled.
+ ) {
+ info.errors|=UIDNA_ERROR_BIDI;
+ }
+ return dest;
+}
+
+void  // Shared UTF-8 implementation behind the four UTF-8 API functions.
+UTS46::processUTF8(const StringPiece &src,
+ UBool isLabel, UBool toASCII,
+ ByteSink &dest,
+ IDNAInfo &info, UErrorCode &errorCode) const {
+ if(U_FAILURE(errorCode)) {
+ return;
+ }
+ const char *srcArray=src.data();
+ int32_t srcLength=src.length();
+ if(srcArray==NULL && srcLength!=0) {
+ errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return;
+ }
+ // Arguments are fine, reset output values.
+ info.reset();
+ if(srcLength==0) {
+ if(toASCII) {  // An empty input is only an error for toASCII.
+ info.errors|=UIDNA_ERROR_EMPTY_LABEL;
+ }
+ dest.Flush();
+ return;
+ }
+ UnicodeString destString;
+ int32_t labelStart=0;
+ if(srcLength<=256) { // length of stackArray[]
+ // ASCII fastpath
+ char stackArray[256];
+ int32_t destCapacity;
+ char *destArray=dest.GetAppendBuffer(srcLength, srcLength+20,
+ stackArray, LENGTHOF(stackArray), &destCapacity);
+ UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
+ int32_t i;
+ for(i=0;; ++i) {
+ if(i==srcLength) {
+ if(toASCII) {
+ if((i-labelStart)>63) {
+ info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
+ }
+ // There is a trailing dot if labelStart==i.
+ if(!isLabel && i>=254 && (i>254 || labelStart0x7f  // NOTE(review): this line is truncated (text between a '<' and a later '>' is missing, including the declaration of c); restore it from the upstream file.
+ break;
+ }
+ int cData=asciiData[(int)c]; // Cast: gcc warns about indexing with a char.
+ if(cData>0) {
+ destArray[i]=c+0x20; // Lowercase an uppercase ASCII letter.
+ } else if(cData<0 && disallowNonLDHDot) {
+ break; // Replacing with U+FFFD can be complicated for toASCII.
+ } else {
+ destArray[i]=c;
+ if(c==0x2d) { // hyphen
+ if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
+ // "??--..." is Punycode or forbidden.
+ break;
+ }
+ if(i==labelStart) {
+ // label starts with "-"
+ info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
+ }
+ if((i+1)==srcLength || srcArray[i+1]==0x2e) {
+ // label ends with "-"
+ info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
+ }
+ } else if(c==0x2e) { // dot
+ if(isLabel) {
+ break; // Replacing with U+FFFD can be complicated for toASCII.
+ }
+ if(toASCII) {
+ // Permit an empty label at the end but not elsewhere.
+ if(i==labelStart && i<(srcLength-1)) {
+ info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
+ } else if((i-labelStart)>63) {
+ info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
+ }
+ }
+ info.errors|=info.labelErrors;  // Fold this label's errors into the overall result.
+ info.labelErrors=0;
+ labelStart=i+1;
+ }
+ }
+ }
+ info.errors|=info.labelErrors;
+ // Convert the processed ASCII prefix of the current label to UTF-16.
+ int32_t mappingStart=i-labelStart;
+ destString=UnicodeString::fromUTF8(StringPiece(destArray+labelStart, mappingStart));
+ // Output the previous ASCII labels and process the rest of src in UTF-16.
+ dest.Append(destArray, labelStart);
+ processUnicode(UnicodeString::fromUTF8(StringPiece(src, labelStart)), 0, mappingStart,
+ isLabel, toASCII,
+ destString, info, errorCode);
+ } else {
+ // src is too long for the ASCII fastpath implementation.
+ processUnicode(UnicodeString::fromUTF8(src), 0, 0,
+ isLabel, toASCII,
+ destString, info, errorCode);
+ }
+ destString.toUTF8(dest); // calls dest.Flush()
+ if(toASCII && !isLabel) {
+ // length==labelStart==254 means that there is a trailing dot (ok) and
+ // destString is empty (do not index at 253-labelStart).
+ int32_t length=labelStart+destString.length();  // Total output length: ASCII prefix labels plus the UTF-16 part.
+ if( length>=254 && isASCIIString(destString) &&
+ (length>254 ||
+ (labelStart<254 && destString[253-labelStart]!=0x2e))
+ ) {
+ info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
+ }
+ }
+ if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
+ (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(srcArray, labelStart)))  // Also check the source labels the fastpath handled.
+ ) {
+ info.errors|=UIDNA_ERROR_BIDI;
+ }
+}
+
+UnicodeString &  // Normalizes src (from mappingStart) into dest, then walks the labels starting at labelStart.
+UTS46::processUnicode(const UnicodeString &src,
+ int32_t labelStart, int32_t mappingStart,
+ UBool isLabel, UBool toASCII,
+ UnicodeString &dest,
+ IDNAInfo &info, UErrorCode &errorCode) const {
+ if(mappingStart==0) {
+ uts46Norm2.normalize(src, dest, errorCode);
+ } else {
+ uts46Norm2.normalizeSecondAndAppend(dest, src.tempSubString(mappingStart), errorCode);  // dest already holds the first mappingStart units.
+ }
+ if(U_FAILURE(errorCode)) {
+ return dest;
+ }
+ UBool doMapDevChars=
+ toASCII ? (options&UIDNA_NONTRANSITIONAL_TO_ASCII)==0 :
+ (options&UIDNA_NONTRANSITIONAL_TO_UNICODE)==0;
+ const UChar *destArray=dest.getBuffer();
+ int32_t destLength=dest.length();
+ int32_t labelLimit=labelStart;
+ while(labelLimit=0x200c)) {  // NOTE(review): this line is truncated (text between a '<' and a later '>' is missing); restore it from the upstream file.
+ info.isTransDiff=TRUE;
+ if(doMapDevChars) {
+ destLength=mapDevChars(dest, labelStart, labelLimit, errorCode);
+ if(U_FAILURE(errorCode)) {
+ return dest;
+ }
+ destArray=dest.getBuffer();  // Re-fetch the buffer pointer after mapDevChars() modified dest.
+ // Do not increment labelLimit in case c was removed.
+ // All deviation characters have been mapped, no need to check for them again.
+ doMapDevChars=FALSE;
+ } else {
+ ++labelLimit;
+ }
+ } else {
+ ++labelLimit;
+ }
+ }
+ // Permit an empty label at the end (0=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && label[3]==0x2d) {
+ // Label starts with "xn--", try to un-Punycode it.
+ wasPunycode=TRUE;  // NOTE(review): the preceding line is truncated; the end of processUnicode(), all of mapDevChars() and the head of processLabel() appear to be missing - restore from upstream.
+ UChar *unicodeBuffer=fromPunycode.getBuffer(-1); // capacity==-1: most labels should fit
+ if(unicodeBuffer==NULL) {
+ // Should never occur if we used capacity==-1 which uses the internal buffer.
+ errorCode=U_MEMORY_ALLOCATION_ERROR;
+ return labelLength;
+ }
+ UErrorCode punycodeErrorCode=U_ZERO_ERROR;
+ int32_t unicodeLength=u_strFromPunycode(label+4, labelLength-4,
+ unicodeBuffer, fromPunycode.getCapacity(),
+ NULL, &punycodeErrorCode);
+ if(punycodeErrorCode==U_BUFFER_OVERFLOW_ERROR) {  // Retry once with a buffer of the reported length.
+ fromPunycode.releaseBuffer(0);
+ unicodeBuffer=fromPunycode.getBuffer(unicodeLength);
+ if(unicodeBuffer==NULL) {
+ errorCode=U_MEMORY_ALLOCATION_ERROR;
+ return labelLength;
+ }
+ punycodeErrorCode=U_ZERO_ERROR;
+ unicodeLength=u_strFromPunycode(label+4, labelLength-4,
+ unicodeBuffer, fromPunycode.getCapacity(),
+ NULL, &punycodeErrorCode);
+ }
+ fromPunycode.releaseBuffer(unicodeLength);
+ if(U_FAILURE(punycodeErrorCode)) {
+ info.labelErrors|=UIDNA_ERROR_PUNYCODE;
+ return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
+ }
+ // Check for NFC, and for characters that are not
+ // valid or deviation characters according to the normalizer.
+ // If there is something wrong, then the string will change.
+ // Note that the normalizer passes through non-LDH ASCII and deviation characters.
+ // Deviation characters are ok in Punycode even in transitional processing.
+ // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
+ // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
+ UBool isValid=uts46Norm2.isNormalized(fromPunycode, errorCode);
+ if(U_FAILURE(errorCode)) {
+ return labelLength;
+ }
+ if(!isValid) {
+ info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
+ return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
+ }
+ labelString=&fromPunycode;  // From here on, work on the decoded label instead of dest.
+ label=fromPunycode.getBuffer();
+ labelStart=0;
+ labelLength=fromPunycode.length();
+ } else {
+ wasPunycode=FALSE;
+ labelString=&dest;
+ }
+ // Validity check
+ if(labelLength==0) {
+ if(toASCII) {
+ info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
+ }
+ return replaceLabel(dest, destLabelStart, destLabelLength, *labelString, labelLength);
+ }
+ // labelLength>0
+ if(labelLength>=4 && label[2]==0x2d && label[3]==0x2d) {
+ // label starts with "??--"
+ info.labelErrors|=UIDNA_ERROR_HYPHEN_3_4;
+ }
+ if(label[0]==0x2d) {
+ // label starts with "-"
+ info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
+ }
+ if(label[labelLength-1]==0x2d) {
+ // label ends with "-"
+ info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
+ }
+ // If the label was not a Punycode label, then it was the result of
+ // mapping, normalization and label segmentation.
+ // If the label was in Punycode, then we mapped it again above
+ // and checked its validity.
+ // Now we handle the STD3 restriction to LDH characters (if set)
+ // and we look for U+FFFD which indicates disallowed characters
+ // in a non-Punycode label or U+FFFD itself in a Punycode label.
+ // We also check for dots which can come from the input to a single-label function.
+ // Ok to cast away const because we own the UnicodeString.
+ UChar *s=(UChar *)label;
+ const UChar *limit=label+labelLength;
+ UChar oredChars=0;  // Bitwise OR of all non-ASCII code units seen in the label.
+ // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
+ UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
+ do {
+ UChar c=*s;
+ if(c<=0x7f) {
+ if(c==0x2e) {
+ info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;
+ *s=0xfffd;
+ } else if(disallowNonLDHDot && asciiData[c]<0) {
+ info.labelErrors|=UIDNA_ERROR_DISALLOWED;
+ *s=0xfffd;
+ }
+ } else {
+ oredChars|=c;
+ if(c==0xfffd) {
+ info.labelErrors|=UIDNA_ERROR_DISALLOWED;
+ ++s;
+ }
+ }
+ ++s;
+ } while(sreplace(labelStart, cpLength, (UChar)0xfffd);  // NOTE(review): this line is truncated (text between a '<' and a later '>' is missing); restore it from the upstream file.
+ label=labelString->getBuffer()+labelStart;
+ labelLength+=1-cpLength;
+ if(labelString==&dest) {
+ destLabelLength=labelLength;
+ }
+ }
+ if((info.labelErrors&severeErrors)==0) {
+ // Do contextual checks only if we do not have U+FFFD from a severe error
+ // because U+FFFD can make these checks fail.
+ if((options&UIDNA_CHECK_BIDI)!=0 && (!info.isBiDi || info.isOkBiDi)) {
+ checkLabelBiDi(label, labelLength, info);
+ }
+ if( (options&UIDNA_CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c &&
+ !isLabelOkContextJ(label, labelLength)
+ ) {
+ info.labelErrors|=UIDNA_ERROR_CONTEXTJ;
+ }
+ if(toASCII) {
+ if(wasPunycode) {
+ // Leave a Punycode label unchanged if it has no severe errors.
+ if(destLabelLength>63) {
+ info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
+ }
+ return destLabelLength;
+ } else if(oredChars>=0x80) {
+ // Contains non-ASCII characters.
+ UnicodeString punycode;
+ UChar *buffer=punycode.getBuffer(63); // 63==maximum DNS label length
+ if(buffer==NULL) {
+ errorCode=U_MEMORY_ALLOCATION_ERROR;
+ return destLabelLength;
+ }
+ buffer[0]=0x78; // Write "xn--".
+ buffer[1]=0x6e;
+ buffer[2]=0x2d;
+ buffer[3]=0x2d;
+ int32_t punycodeLength=u_strToPunycode(label, labelLength,
+ buffer+4, punycode.getCapacity()-4,
+ NULL, &errorCode);
+ if(errorCode==U_BUFFER_OVERFLOW_ERROR) {  // Retry once with room for the prefix plus the reported length.
+ errorCode=U_ZERO_ERROR;
+ punycode.releaseBuffer(4);
+ buffer=punycode.getBuffer(4+punycodeLength);
+ if(buffer==NULL) {
+ errorCode=U_MEMORY_ALLOCATION_ERROR;
+ return destLabelLength;
+ }
+ punycodeLength=u_strToPunycode(label, labelLength,
+ buffer+4, punycode.getCapacity()-4,
+ NULL, &errorCode);
+ }
+ punycodeLength+=4;  // Account for the "xn--" prefix.
+ punycode.releaseBuffer(punycodeLength);
+ if(U_FAILURE(errorCode)) {
+ return destLabelLength;
+ }
+ if(punycodeLength>63) {
+ info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
+ }
+ return replaceLabel(dest, destLabelStart, destLabelLength,
+ punycode, punycodeLength);
+ } else {
+ // all-ASCII label
+ if(labelLength>63) {
+ info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
+ }
+ }
+ }
+ } else {
+ // If a Punycode label has severe errors,
+ // then leave it but make sure it does not look valid.
+ if(wasPunycode) {
+ info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
+ return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info);
+ }
+ }
+ return replaceLabel(dest, destLabelStart, destLabelLength, *labelString, labelLength);
+}
+
+// Make sure an ACE label does not look valid.
+// Append U+FFFD if the label has only LDH characters.
+// If UIDNA_USE_STD3_RULES, also replace disallowed ASCII characters with U+FFFD.
+int32_t  // Returns the label length (see the final return statement).
+UTS46::markBadACELabel(UnicodeString &dest,
+ int32_t labelStart, int32_t labelLength,
+ UBool toASCII, IDNAInfo &info) const {
+ UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
+ UBool isASCII=TRUE;
+ UBool onlyLDH=TRUE;
+ const UChar *label=dest.getBuffer()+labelStart;
+ // Ok to cast away const because we own the UnicodeString.
+ UChar *s=(UChar *)label+4; // After the initial "xn--".
+ const UChar *limit=label+labelLength;
+ do {
+ UChar c=*s;
+ if(c<=0x7f) {
+ if(c==0x2e) {  // A dot inside the label is replaced with U+FFFD.
+ info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;
+ *s=0xfffd;
+ isASCII=onlyLDH=FALSE;
+ } else if(asciiData[c]<0) {  // Non-LDH ASCII character.
+ onlyLDH=FALSE;
+ if(disallowNonLDHDot) {
+ *s=0xfffd;
+ isASCII=FALSE;
+ }
+ }
+ } else {
+ isASCII=onlyLDH=FALSE;
+ }
+ } while(++s63) {  // NOTE(review): this line is truncated (text between a '<' and a later '>' is missing); restore it from the upstream file.
+ info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
+ }
+ }
+ return labelLength;
+}
+
+const uint32_t L_MASK=U_MASK(U_LEFT_TO_RIGHT);  // Bit masks over UCharDirection values, for use with U_MASK(u_charDirection(c)).
+const uint32_t R_AL_MASK=U_MASK(U_RIGHT_TO_LEFT)|U_MASK(U_RIGHT_TO_LEFT_ARABIC);  // R or AL
+const uint32_t L_R_AL_MASK=L_MASK|R_AL_MASK;  // L, R or AL: allowed first character of a label.
+
+const uint32_t R_AL_AN_MASK=R_AL_MASK|U_MASK(U_ARABIC_NUMBER);  // R, AL or AN
+
+const uint32_t EN_AN_MASK=U_MASK(U_EUROPEAN_NUMBER)|U_MASK(U_ARABIC_NUMBER);  // EN or AN
+const uint32_t R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK;  // Allowed last non-NSM character when the first is not L.
+const uint32_t L_EN_MASK=L_MASK|U_MASK(U_EUROPEAN_NUMBER);  // Allowed last non-NSM character when the first is L.
+
+const uint32_t ES_CS_ET_ON_BN_NSM_MASK=  // ES, CS, ET, ON, BN or NSM
+ U_MASK(U_EUROPEAN_NUMBER_SEPARATOR)|
+ U_MASK(U_COMMON_NUMBER_SEPARATOR)|
+ U_MASK(U_EUROPEAN_NUMBER_TERMINATOR)|
+ U_MASK(U_OTHER_NEUTRAL)|
+ U_MASK(U_BOUNDARY_NEUTRAL)|
+ U_MASK(U_DIR_NON_SPACING_MARK);
+const uint32_t L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
+const uint32_t R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
+
+// We scan the whole label and check both for whether it contains RTL characters
+// and whether it passes the BiDi Rule.
+// In a BiDi domain name, all labels must pass the BiDi Rule, but we might find
+// that a domain name is a BiDi domain name (has an RTL label) only after
+// processing several earlier labels.
+void  // Clears info.isOkBiDi when the label violates the BiDi rule conditions checked below.
+UTS46::checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const {
+ // IDNA2008 BiDi rule
+ // Get the directionality of the first character.
+ UChar32 c;
+ int32_t i=0;
+ U16_NEXT_UNSAFE(label, i, c);
+ uint32_t firstMask=U_MASK(u_charDirection(c));
+ // 1. The first character must be a character with BIDI property L, R
+ // or AL. If it has the R or AL property, it is an RTL label; if it
+ // has the L property, it is an LTR label.
+ if((firstMask&~L_R_AL_MASK)!=0) {
+ info.isOkBiDi=FALSE;
+ }
+ // Get the directionality of the last non-NSM character.
+ uint32_t lastMask;
+ for(;;) {
+ if(i>=labelLength) {  // Only the first character is left: it is also the last one.
+ lastMask=firstMask;
+ break;
+ }
+ U16_PREV_UNSAFE(label, labelLength, c);  // Steps backward, shrinking labelLength past trailing characters.
+ UCharDirection dir=u_charDirection(c);
+ if(dir!=U_DIR_NON_SPACING_MARK) {
+ lastMask=U_MASK(dir);
+ break;
+ }
+ }
+ // 3. In an RTL label, the end of the label must be a character with
+ // BIDI property R, AL, EN or AN, followed by zero or more
+ // characters with BIDI property NSM.
+ // 6. In an LTR label, the end of the label must be a character with
+ // BIDI property L or EN, followed by zero or more characters with
+ // BIDI property NSM.
+ if( (firstMask&L_MASK)!=0 ?
+ (lastMask&~L_EN_MASK)!=0 :
+ (lastMask&~R_AL_EN_AN_MASK)!=0
+ ) {
+ info.isOkBiDi=FALSE;
+ }
+ // Get the directionalities of the intervening characters.
+ uint32_t mask=0;
+ while(ilabelStart) {  // NOTE(review): this line is truncated (text between a '<' and a later '>' is missing); the rest of checkLabelBiDi() and the head of the UTF-16 isASCIIOkBiDi() appear to be lost - restore from upstream.
+ c=s[i-1];
+ if(!(0x61<=c && c<=0x7a) && !(0x30<=c && c<=0x39)) {
+ // Last character in the label is not an L or EN.
+ return FALSE;
+ }
+ }
+ labelStart=i+1;
+ } else if(i==labelStart) {
+ if(!(0x61<=c && c<=0x7a)) {
+ // First character in the label is not an L.
+ return FALSE;
+ }
+ } else {
+ if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {
+ // Intermediate character in the label is a B, S or WS.
+ return FALSE;
+ }
+ }
+ }
+ return TRUE;
+}
+
+// UTF-8 version, called for source ASCII prefix.
+// Can contain uppercase A-Z.
+// s[length-1] must be the trailing dot.
+// Checks each dot-terminated ASCII label in s[0..length) against the parts of
+// the BiDi rule that ASCII characters can violate:
+// the first character must be a letter (L), the last one a letter or digit
+// (L or EN), and no intermediate character may be a B, S or WS control/space.
+// Returns FALSE at the first violation, otherwise TRUE.
+static UBool
+isASCIIOkBiDi(const char *s, int32_t length) {
+    int32_t labelStart=0;
+    for(int32_t i=0; i<length; ++i) {
+        char c=s[i];
+        if(c==0x2e) {  // dot
+            if(i>labelStart) {
+                // Non-empty label: look at its last character.
+                c=s[i-1];
+                if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a) && !(0x30<=c && c<=0x39)) {
+                    // Last character in the label is not an L or EN.
+                    return FALSE;
+                }
+            }
+            labelStart=i+1;
+        } else if(i==labelStart) {
+            if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a)) {
+                // First character in the label is not an L.
+                return FALSE;
+            }
+        } else {
+            if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {
+                // Intermediate character in the label is a B, S or WS.
+                return FALSE;
+            }
+        }
+    }
+    return TRUE;
+}
+
+UBool
+UTS46::isLabelOkContextJ(const UChar *label, int32_t labelLength) const {
+ // [IDNA2008-Tables]
+ // 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
+ for(int32_t i=0; i(IDNA::createUTS46Instance(options, *pErrorCode));  // NOTE(review): this line is truncated (text between a '<' and a later '>' is missing); the body of isLabelOkContextJ() and the head of the function that wraps createUTS46Instance() appear to be lost - restore from upstream.
+}
+
+// C API: destroys the IDNA object behind the opaque UIDNA handle.
+U_DRAFT void U_EXPORT2
+uidna_close(UIDNA *idna) {
+    // The handle is the C++ IDNA object itself, viewed through an opaque C type.
+    delete reinterpret_cast<IDNA *>(idna);
+}
+
+// Shared argument validation for the uidna_*() C wrappers.
+// Returns FALSE if *pErrorCode already indicates a failure, or sets
+// U_ILLEGAL_ARGUMENT_ERROR and returns FALSE if pInfo, the source
+// pointer/length, or the destination pointer/capacity are unusable,
+// or if source and destination are the same non-NULL buffer.
+// On success, zeroes *pInfo except for its size field and returns TRUE.
+static UBool
+checkArgs(const void *label, int32_t length,
+          void *dest, int32_t capacity,
+          UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
+    if(U_FAILURE(*pErrorCode)) {
+        return FALSE;
+    }
+    // A NULL source must have length 0; otherwise length may be -1 or non-negative.
+    const UBool badSource= label==NULL ? length!=0 : length<-1;
+    // A NULL destination must have capacity 0; otherwise capacity must not be negative.
+    const UBool badDest= dest==NULL ? capacity!=0 : capacity<0;
+    const UBool sameBuffer= dest==label && label!=NULL;
+    // sizeof(UIDNAInfo)=16 in the first API version.
+    if(pInfo==NULL || pInfo->size<16 || badSource || badDest || sameBuffer) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return FALSE;
+    }
+    // Set all *pInfo bytes to 0 except for the size field itself.
+    uprv_memset(&pInfo->size+1, 0, pInfo->size-sizeof(pInfo->size));
+    return TRUE;
+}
+
+// Copies the results collected in the C++ IDNAInfo object
+// (error bits and the transitional-difference flag) into the C struct.
+static void
+idnaInfoToStruct(IDNAInfo &info, UIDNAInfo *pInfo) {
+    pInfo->errors=info.getErrors();
+    pInfo->isTransitionalDifferent=info.isTransitionalDifferent();
+}
+
+// C API wrapper for IDNA::labelToASCII() on UTF-16 strings.
+// Returns 0 without calling the implementation if checkArgs() rejects the
+// arguments; otherwise returns the value of extracting the result into dest.
+// *pInfo receives the error bits and the transitional-difference flag.
+U_DRAFT int32_t U_EXPORT2
+uidna_labelToASCII(const UIDNA *idna,
+                   const UChar *label, int32_t length,
+                   UChar *dest, int32_t capacity,
+                   UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
+    if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
+        return 0;
+    }
+    // Wrap the caller's buffers; length<0 is passed on as the first constructor flag.
+    UnicodeString src((UBool)(length<0), label, length);
+    UnicodeString destString(dest, 0, capacity);
+    IDNAInfo info;
+    // The opaque C handle is the C++ IDNA object.
+    reinterpret_cast<const IDNA *>(idna)->labelToASCII(src, destString, info, *pErrorCode);
+    idnaInfoToStruct(info, pInfo);
+    return destString.extract(dest, capacity, *pErrorCode);
+}
+
+// C API wrapper for IDNA::labelToUnicode() on UTF-16 strings.
+// Returns 0 without calling the implementation if checkArgs() rejects the
+// arguments; otherwise returns the value of extracting the result into dest.
+// *pInfo receives the error bits and the transitional-difference flag.
+U_DRAFT int32_t U_EXPORT2
+uidna_labelToUnicode(const UIDNA *idna,
+                     const UChar *label, int32_t length,
+                     UChar *dest, int32_t capacity,
+                     UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
+    if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
+        return 0;
+    }
+    // Wrap the caller's buffers; length<0 is passed on as the first constructor flag.
+    UnicodeString src((UBool)(length<0), label, length);
+    UnicodeString destString(dest, 0, capacity);
+    IDNAInfo info;
+    // The opaque C handle is the C++ IDNA object.
+    reinterpret_cast<const IDNA *>(idna)->labelToUnicode(src, destString, info, *pErrorCode);
+    idnaInfoToStruct(info, pInfo);
+    return destString.extract(dest, capacity, *pErrorCode);
+}
+
+// C API wrapper for IDNA::nameToASCII() on UTF-16 strings.
+// Returns 0 without calling the implementation if checkArgs() rejects the
+// arguments; otherwise returns the value of extracting the result into dest.
+// *pInfo receives the error bits and the transitional-difference flag.
+U_DRAFT int32_t U_EXPORT2
+uidna_nameToASCII(const UIDNA *idna,
+                  const UChar *name, int32_t length,
+                  UChar *dest, int32_t capacity,
+                  UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
+    if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
+        return 0;
+    }
+    // Wrap the caller's buffers; length<0 is passed on as the first constructor flag.
+    UnicodeString src((UBool)(length<0), name, length);
+    UnicodeString destString(dest, 0, capacity);
+    IDNAInfo info;
+    // The opaque C handle is the C++ IDNA object.
+    reinterpret_cast<const IDNA *>(idna)->nameToASCII(src, destString, info, *pErrorCode);
+    idnaInfoToStruct(info, pInfo);
+    return destString.extract(dest, capacity, *pErrorCode);
+}
+
+// C API wrapper for IDNA::nameToUnicode() on UTF-16 strings.
+// Returns 0 without calling the implementation if checkArgs() rejects the
+// arguments; otherwise returns the value of extracting the result into dest.
+// *pInfo receives the error bits and the transitional-difference flag.
+U_DRAFT int32_t U_EXPORT2
+uidna_nameToUnicode(const UIDNA *idna,
+                    const UChar *name, int32_t length,
+                    UChar *dest, int32_t capacity,
+                    UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
+    if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
+        return 0;
+    }
+    // Wrap the caller's buffers; length<0 is passed on as the first constructor flag.
+    UnicodeString src((UBool)(length<0), name, length);
+    UnicodeString destString(dest, 0, capacity);
+    IDNAInfo info;
+    // The opaque C handle is the C++ IDNA object.
+    reinterpret_cast<const IDNA *>(idna)->nameToUnicode(src, destString, info, *pErrorCode);
+    idnaInfoToStruct(info, pInfo);
+    return destString.extract(dest, capacity, *pErrorCode);
+}
+
+// C API wrapper for IDNA::labelToASCII_UTF8().
+// Returns 0 without calling the implementation if checkArgs() rejects the
+// arguments; otherwise returns the u_terminateChars() result for the number
+// of bytes appended to the sink over dest.
+// *pInfo receives the error bits and the transitional-difference flag.
+U_DRAFT int32_t U_EXPORT2
+uidna_labelToASCII_UTF8(const UIDNA *idna,
+                        const char *label, int32_t length,
+                        char *dest, int32_t capacity,
+                        UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
+    if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
+        return 0;
+    }
+    // length<0: the input is NUL-terminated, so measure it.
+    StringPiece src(label, length<0 ? (int32_t)uprv_strlen(label) : length);
+    CheckedArrayByteSink sink(dest, capacity);
+    IDNAInfo info;
+    // The opaque C handle is the C++ IDNA object.
+    reinterpret_cast<const IDNA *>(idna)->labelToASCII_UTF8(src, sink, info, *pErrorCode);
+    idnaInfoToStruct(info, pInfo);
+    return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
+}
+
+// C API wrapper for IDNA::labelToUnicodeUTF8().
+// Returns 0 without calling the implementation if checkArgs() rejects the
+// arguments; otherwise returns the u_terminateChars() result for the number
+// of bytes appended to the sink over dest.
+// *pInfo receives the error bits and the transitional-difference flag.
+U_DRAFT int32_t U_EXPORT2
+uidna_labelToUnicodeUTF8(const UIDNA *idna,
+                         const char *label, int32_t length,
+                         char *dest, int32_t capacity,
+                         UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
+    if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
+        return 0;
+    }
+    // length<0: the input is NUL-terminated, so measure it.
+    StringPiece src(label, length<0 ? (int32_t)uprv_strlen(label) : length);
+    CheckedArrayByteSink sink(dest, capacity);
+    IDNAInfo info;
+    // The opaque C handle is the C++ IDNA object.
+    reinterpret_cast<const IDNA *>(idna)->labelToUnicodeUTF8(src, sink, info, *pErrorCode);
+    idnaInfoToStruct(info, pInfo);
+    return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
+}
+
+// C API wrapper for IDNA::nameToASCII_UTF8().
+// Returns 0 without calling the implementation if checkArgs() rejects the
+// arguments; otherwise returns the u_terminateChars() result for the number
+// of bytes appended to the sink over dest.
+// *pInfo receives the error bits and the transitional-difference flag.
+U_DRAFT int32_t U_EXPORT2
+uidna_nameToASCII_UTF8(const UIDNA *idna,
+                       const char *name, int32_t length,
+                       char *dest, int32_t capacity,
+                       UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
+    if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
+        return 0;
+    }
+    // length<0: the input is NUL-terminated, so measure it.
+    StringPiece src(name, length<0 ? (int32_t)uprv_strlen(name) : length);
+    CheckedArrayByteSink sink(dest, capacity);
+    IDNAInfo info;
+    // The opaque C handle is the C++ IDNA object.
+    reinterpret_cast<const IDNA *>(idna)->nameToASCII_UTF8(src, sink, info, *pErrorCode);
+    idnaInfoToStruct(info, pInfo);
+    return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
+}
+
+// C API wrapper for IDNA::nameToUnicodeUTF8().
+// Returns 0 without calling the implementation if checkArgs() rejects the
+// arguments; otherwise returns the u_terminateChars() result for the number
+// of bytes appended to the sink over dest.
+// *pInfo receives the error bits and the transitional-difference flag.
+U_DRAFT int32_t U_EXPORT2
+uidna_nameToUnicodeUTF8(const UIDNA *idna,
+                        const char *name, int32_t length,
+                        char *dest, int32_t capacity,
+                        UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
+    if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
+        return 0;
+    }
+    // length<0: the input is NUL-terminated, so measure it.
+    StringPiece src(name, length<0 ? (int32_t)uprv_strlen(name) : length);
+    CheckedArrayByteSink sink(dest, capacity);
+    IDNAInfo info;
+    // The opaque C handle is the C++ IDNA object.
+    reinterpret_cast<const IDNA *>(idna)->nameToUnicodeUTF8(src, sink, info, *pErrorCode);
+    idnaInfoToStruct(info, pInfo);
+    return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
+}
+
+#endif // UCONFIG_NO_IDNA
diff --git a/icu4c/source/data/Makefile.in b/icu4c/source/data/Makefile.in
index 5e507fc850..82ec1c55a9 100644
--- a/icu4c/source/data/Makefile.in
+++ b/icu4c/source/data/Makefile.in
@@ -227,7 +227,7 @@ package390: $(OUTTMPDIR)/icudata390.lst $(PKGDATA_LIST) ./icupkg.inc packagedata
# 2005-may-05 Removed Unicode properties files (unorm.icu, uprops.icu, ucase.icu, ubidi.icu)
# from data build. See Jitterbug 4497. (makedata.mak revision 1.117)
#
-DAT_FILES_SHORT=pnames.icu unames.icu cnvalias.icu coll/ucadata.icu coll/invuca.icu nfc.nrm nfkc.nrm nfkc_cf.nrm
+DAT_FILES_SHORT=pnames.icu unames.icu cnvalias.icu coll/ucadata.icu coll/invuca.icu nfc.nrm nfkc.nrm nfkc_cf.nrm uts46.nrm
DAT_FILES=$(DAT_FILES_SHORT:%=$(BUILDDIR)/%)
## BRK files
diff --git a/icu4c/source/data/in/uts46.nrm b/icu4c/source/data/in/uts46.nrm
new file mode 100644
index 0000000000..91f7cf4035
Binary files /dev/null and b/icu4c/source/data/in/uts46.nrm differ
diff --git a/icu4c/source/data/makedata.mak b/icu4c/source/data/makedata.mak
index 0563781c6c..173658b3f0 100644
--- a/icu4c/source/data/makedata.mak
+++ b/icu4c/source/data/makedata.mak
@@ -586,7 +586,7 @@ icu4j-data-install :
copy "$(ICUTMP)\$(ICUPKG).dat" "$(ICUOUT)\$(U_ICUDATA_NAME)$(U_ICUDATA_ENDIAN_SUFFIX).dat"
-@erase "$(ICUTMP)\$(ICUPKG).dat"
!ELSE
-"$(ICU_LIB_TARGET)" : $(COMMON_ICUDATA_DEPENDENCIES) $(CNV_FILES) $(CNV_FILES_SPECIAL) "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\cnvalias.icu" "$(ICUBLD_PKG)\nfc.nrm" "$(ICUBLD_PKG)\nfkc.nrm" "$(ICUBLD_PKG)\nfkc_cf.nrm" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu" "$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" $(CURR_RES_FILES) $(LANG_RES_FILES) $(REGION_RES_FILES) $(ZONE_RES_FILES) $(BRK_FILES) $(BRK_CTD_FILES) $(BRK_RES_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(TRANSLIT_RES_FILES) $(ALL_RES) $(SPREP_FILES) "$(ICUBLD_PKG)\confusables.cfu"
+"$(ICU_LIB_TARGET)" : $(COMMON_ICUDATA_DEPENDENCIES) $(CNV_FILES) $(CNV_FILES_SPECIAL) "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\cnvalias.icu" "$(ICUBLD_PKG)\nfc.nrm" "$(ICUBLD_PKG)\nfkc.nrm" "$(ICUBLD_PKG)\nfkc_cf.nrm" "$(ICUBLD_PKG)\uts46.nrm" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu" "$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" $(CURR_RES_FILES) $(LANG_RES_FILES) $(REGION_RES_FILES) $(ZONE_RES_FILES) $(BRK_FILES) $(BRK_CTD_FILES) $(BRK_RES_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(TRANSLIT_RES_FILES) $(ALL_RES) $(SPREP_FILES) "$(ICUBLD_PKG)\confusables.cfu"
@echo Building icu data
cd "$(ICUBLD_PKG)"
"$(ICUPBIN)\pkgdata" $(COMMON_ICUDATA_ARGUMENTS) <<"$(ICUTMP)\icudata.lst"
@@ -599,6 +599,7 @@ cnvalias.icu
nfc.nrm
nfkc.nrm
nfkc_cf.nrm
+uts46.nrm
$(CNV_FILES:.cnv =.cnv
)
$(CNV_FILES_SPECIAL:.cnv =.cnv
@@ -945,6 +946,9 @@ res_index:table(nofallback) {
"$(ICUBLD_PKG)\nfkc_cf.nrm": $(ICUSRCDATA_RELATIVE_PATH)\in\nfkc_cf.nrm
"$(ICUPBIN)\icupkg" -tl $? $@
+"$(ICUBLD_PKG)\uts46.nrm": $(ICUSRCDATA_RELATIVE_PATH)\in\uts46.nrm
+ "$(ICUPBIN)\icupkg" -tl $? $@
+
"$(ICUBLD_PKG)\coll\invuca.icu": $(ICUSRCDATA_RELATIVE_PATH)\in\coll\invuca.icu
"$(ICUPBIN)\icupkg" -tl $? $@
diff --git a/icu4c/source/data/unidata/norm2/uts46.txt b/icu4c/source/data/unidata/norm2/uts46.txt
new file mode 100644
index 0000000000..6420182a8a
--- /dev/null
+++ b/icu4c/source/data/unidata/norm2/uts46.txt
@@ -0,0 +1,6618 @@
+# Original file:
+# IdnaMappingTable-5.2.0.txt- DRAFT
+# Date: 2010-03-31 20:48:03 GMT [MD]
+#
+# Unicode IDNA Compatible Preprocessing (UTS #46)
+# Copyright (c) 1991-2009 Unicode, Inc.
+# For terms of use, see http://www.unicode.org/terms_of_use.html
+# For documentation, see http://www.unicode.org/reports/tr46/
+
+# ================================================
+# This file has been reformatted into syntax for the
+# gennorm2 Normalizer2 data generator tool.
+# Reformatting via regular expressions:
+# s/; disallowed />FFFD/
+# s/; ignored />/
+# s/^([^;]+) ; valid/# \1valid/
+# s/; mapped ; />/
+# s/^([^;]+) ; deviation/# \1deviation/
+# s/ +(\# [^\#]+)$/ \1/
+#
+# Except: Disallowed ASCII characters are passed through;
+# they are handled in code.
+# Deviation characters are also handled in code.
+#
+# A circular mapping FFFD>FFFD is avoided by rewriting the line that contains
+# ..FFFD to contain ..FFFC instead.
+#
+# Use this file as the second gennorm2 input file after nfc.txt.
+# ================================================
+
+# 0000..002C (allow ASCII) # <control-0000>..COMMA
+# 002D..002E valid # HYPHEN-MINUS..FULL STOP
+# 002F (allow ASCII) # SOLIDUS
+# 0030..0039 valid # DIGIT ZERO..DIGIT NINE
+# 003A..0040 (allow ASCII) # COLON..COMMERCIAL AT
+0041 >0061 # LATIN CAPITAL LETTER A
+0042 >0062 # LATIN CAPITAL LETTER B
+0043 >0063 # LATIN CAPITAL LETTER C
+0044 >0064 # LATIN CAPITAL LETTER D
+0045 >0065 # LATIN CAPITAL LETTER E
+0046 >0066 # LATIN CAPITAL LETTER F
+0047 >0067 # LATIN CAPITAL LETTER G
+0048 >0068 # LATIN CAPITAL LETTER H
+0049 >0069 # LATIN CAPITAL LETTER I
+004A >006A # LATIN CAPITAL LETTER J
+004B >006B # LATIN CAPITAL LETTER K
+004C >006C # LATIN CAPITAL LETTER L
+004D >006D # LATIN CAPITAL LETTER M
+004E >006E # LATIN CAPITAL LETTER N
+004F >006F # LATIN CAPITAL LETTER O
+0050 >0070 # LATIN CAPITAL LETTER P
+0051 >0071 # LATIN CAPITAL LETTER Q
+0052 >0072 # LATIN CAPITAL LETTER R
+0053 >0073 # LATIN CAPITAL LETTER S
+0054 >0074 # LATIN CAPITAL LETTER T
+0055 >0075 # LATIN CAPITAL LETTER U
+0056 >0076 # LATIN CAPITAL LETTER V
+0057 >0077 # LATIN CAPITAL LETTER W
+0058 >0078 # LATIN CAPITAL LETTER X
+0059 >0079 # LATIN CAPITAL LETTER Y
+005A >007A # LATIN CAPITAL LETTER Z
+# 005B..0060 (allow ASCII) # LEFT SQUARE BRACKET..GRAVE ACCENT
+# 0061..007A valid # LATIN SMALL LETTER A..LATIN SMALL LETTER Z
+0080..00A0 >FFFD # (allow ASCII) LEFT CURLY BRACKET..NO-BREAK SPACE
+# 00A1..00A7 valid # INVERTED EXCLAMATION MARK..SECTION SIGN
+00A8 >FFFD # DIAERESIS
+# 00A9 valid # COPYRIGHT SIGN
+00AA >0061 # FEMININE ORDINAL INDICATOR
+# 00AB..00AC valid # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK..NOT SIGN
+00AD > # SOFT HYPHEN
+# 00AE valid # REGISTERED SIGN
+00AF >FFFD # MACRON
+# 00B0..00B1 valid # DEGREE SIGN..PLUS-MINUS SIGN
+00B2 >0032 # SUPERSCRIPT TWO
+00B3 >0033 # SUPERSCRIPT THREE
+00B4 >FFFD # ACUTE ACCENT
+00B5 >03BC # MICRO SIGN
+# 00B6..00B7 valid # PILCROW SIGN..MIDDLE DOT
+00B8 >FFFD # CEDILLA
+00B9 >0031 # SUPERSCRIPT ONE
+00BA >006F # MASCULINE ORDINAL INDICATOR
+# 00BB valid # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
+00BC >0031 2044 0034 # VULGAR FRACTION ONE QUARTER
+00BD >0031 2044 0032 # VULGAR FRACTION ONE HALF
+00BE >0033 2044 0034 # VULGAR FRACTION THREE QUARTERS
+# 00BF valid # INVERTED QUESTION MARK
+00C0 >00E0 # LATIN CAPITAL LETTER A WITH GRAVE
+00C1 >00E1 # LATIN CAPITAL LETTER A WITH ACUTE
+00C2 >00E2 # LATIN CAPITAL LETTER A WITH CIRCUMFLEX
+00C3 >00E3 # LATIN CAPITAL LETTER A WITH TILDE
+00C4 >00E4 # LATIN CAPITAL LETTER A WITH DIAERESIS
+00C5 >00E5 # LATIN CAPITAL LETTER A WITH RING ABOVE
+00C6 >00E6 # LATIN CAPITAL LETTER AE
+00C7 >00E7 # LATIN CAPITAL LETTER C WITH CEDILLA
+00C8 >00E8 # LATIN CAPITAL LETTER E WITH GRAVE
+00C9 >00E9 # LATIN CAPITAL LETTER E WITH ACUTE
+00CA >00EA # LATIN CAPITAL LETTER E WITH CIRCUMFLEX
+00CB >00EB # LATIN CAPITAL LETTER E WITH DIAERESIS
+00CC >00EC # LATIN CAPITAL LETTER I WITH GRAVE
+00CD >00ED # LATIN CAPITAL LETTER I WITH ACUTE
+00CE >00EE # LATIN CAPITAL LETTER I WITH CIRCUMFLEX
+00CF >00EF # LATIN CAPITAL LETTER I WITH DIAERESIS
+00D0 >00F0 # LATIN CAPITAL LETTER ETH
+00D1 >00F1 # LATIN CAPITAL LETTER N WITH TILDE
+00D2 >00F2 # LATIN CAPITAL LETTER O WITH GRAVE
+00D3 >00F3 # LATIN CAPITAL LETTER O WITH ACUTE
+00D4 >00F4 # LATIN CAPITAL LETTER O WITH CIRCUMFLEX
+00D5 >00F5 # LATIN CAPITAL LETTER O WITH TILDE
+00D6 >00F6 # LATIN CAPITAL LETTER O WITH DIAERESIS
+# 00D7 valid # MULTIPLICATION SIGN
+00D8 >00F8 # LATIN CAPITAL LETTER O WITH STROKE
+00D9 >00F9 # LATIN CAPITAL LETTER U WITH GRAVE
+00DA >00FA # LATIN CAPITAL LETTER U WITH ACUTE
+00DB >00FB # LATIN CAPITAL LETTER U WITH CIRCUMFLEX
+00DC >00FC # LATIN CAPITAL LETTER U WITH DIAERESIS
+00DD >00FD # LATIN CAPITAL LETTER Y WITH ACUTE
+00DE >00FE # LATIN CAPITAL LETTER THORN
+# 00DF deviation ; 0073 0073 # LATIN SMALL LETTER SHARP S
+# 00E0..00FF valid # LATIN SMALL LETTER A WITH GRAVE..LATIN SMALL LETTER Y WITH DIAERESIS
+0100 >0101 # LATIN CAPITAL LETTER A WITH MACRON
+# 0101 valid # LATIN SMALL LETTER A WITH MACRON
+0102 >0103 # LATIN CAPITAL LETTER A WITH BREVE
+# 0103 valid # LATIN SMALL LETTER A WITH BREVE
+0104 >0105 # LATIN CAPITAL LETTER A WITH OGONEK
+# 0105 valid # LATIN SMALL LETTER A WITH OGONEK
+0106 >0107 # LATIN CAPITAL LETTER C WITH ACUTE
+# 0107 valid # LATIN SMALL LETTER C WITH ACUTE
+0108 >0109 # LATIN CAPITAL LETTER C WITH CIRCUMFLEX
+# 0109 valid # LATIN SMALL LETTER C WITH CIRCUMFLEX
+010A >010B # LATIN CAPITAL LETTER C WITH DOT ABOVE
+# 010B valid # LATIN SMALL LETTER C WITH DOT ABOVE
+010C >010D # LATIN CAPITAL LETTER C WITH CARON
+# 010D valid # LATIN SMALL LETTER C WITH CARON
+010E >010F # LATIN CAPITAL LETTER D WITH CARON
+# 010F valid # LATIN SMALL LETTER D WITH CARON
+0110 >0111 # LATIN CAPITAL LETTER D WITH STROKE
+# 0111 valid # LATIN SMALL LETTER D WITH STROKE
+0112 >0113 # LATIN CAPITAL LETTER E WITH MACRON
+# 0113 valid # LATIN SMALL LETTER E WITH MACRON
+0114 >0115 # LATIN CAPITAL LETTER E WITH BREVE
+# 0115 valid # LATIN SMALL LETTER E WITH BREVE
+0116 >0117 # LATIN CAPITAL LETTER E WITH DOT ABOVE
+# 0117 valid # LATIN SMALL LETTER E WITH DOT ABOVE
+0118 >0119 # LATIN CAPITAL LETTER E WITH OGONEK
+# 0119 valid # LATIN SMALL LETTER E WITH OGONEK
+011A >011B # LATIN CAPITAL LETTER E WITH CARON
+# 011B valid # LATIN SMALL LETTER E WITH CARON
+011C >011D # LATIN CAPITAL LETTER G WITH CIRCUMFLEX
+# 011D valid # LATIN SMALL LETTER G WITH CIRCUMFLEX
+011E >011F # LATIN CAPITAL LETTER G WITH BREVE
+# 011F valid # LATIN SMALL LETTER G WITH BREVE
+0120 >0121 # LATIN CAPITAL LETTER G WITH DOT ABOVE
+# 0121 valid # LATIN SMALL LETTER G WITH DOT ABOVE
+0122 >0123 # LATIN CAPITAL LETTER G WITH CEDILLA
+# 0123 valid # LATIN SMALL LETTER G WITH CEDILLA
+0124 >0125 # LATIN CAPITAL LETTER H WITH CIRCUMFLEX
+# 0125 valid # LATIN SMALL LETTER H WITH CIRCUMFLEX
+0126 >0127 # LATIN CAPITAL LETTER H WITH STROKE
+# 0127 valid # LATIN SMALL LETTER H WITH STROKE
+0128 >0129 # LATIN CAPITAL LETTER I WITH TILDE
+# 0129 valid # LATIN SMALL LETTER I WITH TILDE
+012A >012B # LATIN CAPITAL LETTER I WITH MACRON
+# 012B valid # LATIN SMALL LETTER I WITH MACRON
+012C >012D # LATIN CAPITAL LETTER I WITH BREVE
+# 012D valid # LATIN SMALL LETTER I WITH BREVE
+012E >012F # LATIN CAPITAL LETTER I WITH OGONEK
+# 012F valid # LATIN SMALL LETTER I WITH OGONEK
+0130 >0069 0307 # LATIN CAPITAL LETTER I WITH DOT ABOVE
+# 0131 valid # LATIN SMALL LETTER DOTLESS I
+0132..0133 >0069 006A # LATIN CAPITAL LIGATURE IJ..LATIN SMALL LIGATURE IJ
+0134 >0135 # LATIN CAPITAL LETTER J WITH CIRCUMFLEX
+# 0135 valid # LATIN SMALL LETTER J WITH CIRCUMFLEX
+0136 >0137 # LATIN CAPITAL LETTER K WITH CEDILLA
+# 0137..0138 valid # LATIN SMALL LETTER K WITH CEDILLA..LATIN SMALL LETTER KRA
+0139 >013A # LATIN CAPITAL LETTER L WITH ACUTE
+# 013A valid # LATIN SMALL LETTER L WITH ACUTE
+013B >013C # LATIN CAPITAL LETTER L WITH CEDILLA
+# 013C valid # LATIN SMALL LETTER L WITH CEDILLA
+013D >013E # LATIN CAPITAL LETTER L WITH CARON
+# 013E valid # LATIN SMALL LETTER L WITH CARON
+013F..0140 >006C 00B7 # LATIN CAPITAL LETTER L WITH MIDDLE DOT..LATIN SMALL LETTER L WITH MIDDLE DOT
+0141 >0142 # LATIN CAPITAL LETTER L WITH STROKE
+# 0142 valid # LATIN SMALL LETTER L WITH STROKE
+0143 >0144 # LATIN CAPITAL LETTER N WITH ACUTE
+# 0144 valid # LATIN SMALL LETTER N WITH ACUTE
+0145 >0146 # LATIN CAPITAL LETTER N WITH CEDILLA
+# 0146 valid # LATIN SMALL LETTER N WITH CEDILLA
+0147 >0148 # LATIN CAPITAL LETTER N WITH CARON
+# 0148 valid # LATIN SMALL LETTER N WITH CARON
+0149 >02BC 006E # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
+014A >014B # LATIN CAPITAL LETTER ENG
+# 014B valid # LATIN SMALL LETTER ENG
+014C >014D # LATIN CAPITAL LETTER O WITH MACRON
+# 014D valid # LATIN SMALL LETTER O WITH MACRON
+014E >014F # LATIN CAPITAL LETTER O WITH BREVE
+# 014F valid # LATIN SMALL LETTER O WITH BREVE
+0150 >0151 # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE
+# 0151 valid # LATIN SMALL LETTER O WITH DOUBLE ACUTE
+0152 >0153 # LATIN CAPITAL LIGATURE OE
+# 0153 valid # LATIN SMALL LIGATURE OE
+0154 >0155 # LATIN CAPITAL LETTER R WITH ACUTE
+# 0155 valid # LATIN SMALL LETTER R WITH ACUTE
+0156 >0157 # LATIN CAPITAL LETTER R WITH CEDILLA
+# 0157 valid # LATIN SMALL LETTER R WITH CEDILLA
+0158 >0159 # LATIN CAPITAL LETTER R WITH CARON
+# 0159 valid # LATIN SMALL LETTER R WITH CARON
+015A >015B # LATIN CAPITAL LETTER S WITH ACUTE
+# 015B valid # LATIN SMALL LETTER S WITH ACUTE
+015C >015D # LATIN CAPITAL LETTER S WITH CIRCUMFLEX
+# 015D valid # LATIN SMALL LETTER S WITH CIRCUMFLEX
+015E >015F # LATIN CAPITAL LETTER S WITH CEDILLA
+# 015F valid # LATIN SMALL LETTER S WITH CEDILLA
+0160 >0161 # LATIN CAPITAL LETTER S WITH CARON
+# 0161 valid # LATIN SMALL LETTER S WITH CARON
+0162 >0163 # LATIN CAPITAL LETTER T WITH CEDILLA
+# 0163 valid # LATIN SMALL LETTER T WITH CEDILLA
+0164 >0165 # LATIN CAPITAL LETTER T WITH CARON
+# 0165 valid # LATIN SMALL LETTER T WITH CARON
+0166 >0167 # LATIN CAPITAL LETTER T WITH STROKE
+# 0167 valid # LATIN SMALL LETTER T WITH STROKE
+0168 >0169 # LATIN CAPITAL LETTER U WITH TILDE
+# 0169 valid # LATIN SMALL LETTER U WITH TILDE
+016A >016B # LATIN CAPITAL LETTER U WITH MACRON
+# 016B valid # LATIN SMALL LETTER U WITH MACRON
+016C >016D # LATIN CAPITAL LETTER U WITH BREVE
+# 016D valid # LATIN SMALL LETTER U WITH BREVE
+016E >016F # LATIN CAPITAL LETTER U WITH RING ABOVE
+# 016F valid # LATIN SMALL LETTER U WITH RING ABOVE
+0170 >0171 # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE
+# 0171 valid # LATIN SMALL LETTER U WITH DOUBLE ACUTE
+0172 >0173 # LATIN CAPITAL LETTER U WITH OGONEK
+# 0173 valid # LATIN SMALL LETTER U WITH OGONEK
+0174 >0175 # LATIN CAPITAL LETTER W WITH CIRCUMFLEX
+# 0175 valid # LATIN SMALL LETTER W WITH CIRCUMFLEX
+0176 >0177 # LATIN CAPITAL LETTER Y WITH CIRCUMFLEX
+# 0177 valid # LATIN SMALL LETTER Y WITH CIRCUMFLEX
+0178 >00FF # LATIN CAPITAL LETTER Y WITH DIAERESIS
+0179 >017A # LATIN CAPITAL LETTER Z WITH ACUTE
+# 017A valid # LATIN SMALL LETTER Z WITH ACUTE
+017B >017C # LATIN CAPITAL LETTER Z WITH DOT ABOVE
+# 017C valid # LATIN SMALL LETTER Z WITH DOT ABOVE
+017D >017E # LATIN CAPITAL LETTER Z WITH CARON
+# 017E valid # LATIN SMALL LETTER Z WITH CARON
+017F >0073 # LATIN SMALL LETTER LONG S
+# 0180 valid # LATIN SMALL LETTER B WITH STROKE
+0181 >0253 # LATIN CAPITAL LETTER B WITH HOOK
+0182 >0183 # LATIN CAPITAL LETTER B WITH TOPBAR
+# 0183 valid # LATIN SMALL LETTER B WITH TOPBAR
+0184 >0185 # LATIN CAPITAL LETTER TONE SIX
+# 0185 valid # LATIN SMALL LETTER TONE SIX
+0186 >0254 # LATIN CAPITAL LETTER OPEN O
+0187 >0188 # LATIN CAPITAL LETTER C WITH HOOK
+# 0188 valid # LATIN SMALL LETTER C WITH HOOK
+0189 >0256 # LATIN CAPITAL LETTER AFRICAN D
+018A >0257 # LATIN CAPITAL LETTER D WITH HOOK
+018B >018C # LATIN CAPITAL LETTER D WITH TOPBAR
+# 018C..018D valid # LATIN SMALL LETTER D WITH TOPBAR..LATIN SMALL LETTER TURNED DELTA
+018E >01DD # LATIN CAPITAL LETTER REVERSED E
+018F >0259 # LATIN CAPITAL LETTER SCHWA
+0190 >025B # LATIN CAPITAL LETTER OPEN E
+0191 >0192 # LATIN CAPITAL LETTER F WITH HOOK
+# 0192 valid # LATIN SMALL LETTER F WITH HOOK
+0193 >0260 # LATIN CAPITAL LETTER G WITH HOOK
+0194 >0263 # LATIN CAPITAL LETTER GAMMA
+# 0195 valid # LATIN SMALL LETTER HV
+0196 >0269 # LATIN CAPITAL LETTER IOTA
+0197 >0268 # LATIN CAPITAL LETTER I WITH STROKE
+0198 >0199 # LATIN CAPITAL LETTER K WITH HOOK
+# 0199..019B valid # LATIN SMALL LETTER K WITH HOOK..LATIN SMALL LETTER LAMBDA WITH STROKE
+019C >026F # LATIN CAPITAL LETTER TURNED M
+019D >0272 # LATIN CAPITAL LETTER N WITH LEFT HOOK
+# 019E valid # LATIN SMALL LETTER N WITH LONG RIGHT LEG
+019F >0275 # LATIN CAPITAL LETTER O WITH MIDDLE TILDE
+01A0 >01A1 # LATIN CAPITAL LETTER O WITH HORN
+# 01A1 valid # LATIN SMALL LETTER O WITH HORN
+01A2 >01A3 # LATIN CAPITAL LETTER OI
+# 01A3 valid # LATIN SMALL LETTER OI
+01A4 >01A5 # LATIN CAPITAL LETTER P WITH HOOK
+# 01A5 valid # LATIN SMALL LETTER P WITH HOOK
+01A6 >0280 # LATIN LETTER YR
+01A7 >01A8 # LATIN CAPITAL LETTER TONE TWO
+# 01A8 valid # LATIN SMALL LETTER TONE TWO
+01A9 >0283 # LATIN CAPITAL LETTER ESH
+# 01AA..01AB valid # LATIN LETTER REVERSED ESH LOOP..LATIN SMALL LETTER T WITH PALATAL HOOK
+01AC >01AD # LATIN CAPITAL LETTER T WITH HOOK
+# 01AD valid # LATIN SMALL LETTER T WITH HOOK
+01AE >0288 # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
+01AF >01B0 # LATIN CAPITAL LETTER U WITH HORN
+# 01B0 valid # LATIN SMALL LETTER U WITH HORN
+01B1 >028A # LATIN CAPITAL LETTER UPSILON
+01B2 >028B # LATIN CAPITAL LETTER V WITH HOOK
+01B3 >01B4 # LATIN CAPITAL LETTER Y WITH HOOK
+# 01B4 valid # LATIN SMALL LETTER Y WITH HOOK
+01B5 >01B6 # LATIN CAPITAL LETTER Z WITH STROKE
+# 01B6 valid # LATIN SMALL LETTER Z WITH STROKE
+01B7 >0292 # LATIN CAPITAL LETTER EZH
+01B8 >01B9 # LATIN CAPITAL LETTER EZH REVERSED
+# 01B9..01BB valid # LATIN SMALL LETTER EZH REVERSED..LATIN LETTER TWO WITH STROKE
+01BC >01BD # LATIN CAPITAL LETTER TONE FIVE
+# 01BD..01C3 valid # LATIN SMALL LETTER TONE FIVE..LATIN LETTER RETROFLEX CLICK
+01C4..01C6 >0064 017E # LATIN CAPITAL LETTER DZ WITH CARON..LATIN SMALL LETTER DZ WITH CARON
+01C7..01C9 >006C 006A # LATIN CAPITAL LETTER LJ..LATIN SMALL LETTER LJ
+01CA..01CC >006E 006A # LATIN CAPITAL LETTER NJ..LATIN SMALL LETTER NJ
+01CD >01CE # LATIN CAPITAL LETTER A WITH CARON
+# 01CE valid # LATIN SMALL LETTER A WITH CARON
+01CF >01D0 # LATIN CAPITAL LETTER I WITH CARON
+# 01D0 valid # LATIN SMALL LETTER I WITH CARON
+01D1 >01D2 # LATIN CAPITAL LETTER O WITH CARON
+# 01D2 valid # LATIN SMALL LETTER O WITH CARON
+01D3 >01D4 # LATIN CAPITAL LETTER U WITH CARON
+# 01D4 valid # LATIN SMALL LETTER U WITH CARON
+01D5 >01D6 # LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
+# 01D6 valid # LATIN SMALL LETTER U WITH DIAERESIS AND MACRON
+01D7 >01D8 # LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE
+# 01D8 valid # LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE
+01D9 >01DA # LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON
+# 01DA valid # LATIN SMALL LETTER U WITH DIAERESIS AND CARON
+01DB >01DC # LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE
+# 01DC..01DD valid # LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE..LATIN SMALL LETTER TURNED E
+01DE >01DF # LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON
+# 01DF valid # LATIN SMALL LETTER A WITH DIAERESIS AND MACRON
+01E0 >01E1 # LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON
+# 01E1 valid # LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON
+01E2 >01E3 # LATIN CAPITAL LETTER AE WITH MACRON
+# 01E3 valid # LATIN SMALL LETTER AE WITH MACRON
+01E4 >01E5 # LATIN CAPITAL LETTER G WITH STROKE
+# 01E5 valid # LATIN SMALL LETTER G WITH STROKE
+01E6 >01E7 # LATIN CAPITAL LETTER G WITH CARON
+# 01E7 valid # LATIN SMALL LETTER G WITH CARON
+01E8 >01E9 # LATIN CAPITAL LETTER K WITH CARON
+# 01E9 valid # LATIN SMALL LETTER K WITH CARON
+01EA >01EB # LATIN CAPITAL LETTER O WITH OGONEK
+# 01EB valid # LATIN SMALL LETTER O WITH OGONEK
+01EC >01ED # LATIN CAPITAL LETTER O WITH OGONEK AND MACRON
+# 01ED valid # LATIN SMALL LETTER O WITH OGONEK AND MACRON
+01EE >01EF # LATIN CAPITAL LETTER EZH WITH CARON
+# 01EF..01F0 valid # LATIN SMALL LETTER EZH WITH CARON..LATIN SMALL LETTER J WITH CARON
+01F1..01F3 >0064 007A # LATIN CAPITAL LETTER DZ..LATIN SMALL LETTER DZ
+01F4 >01F5 # LATIN CAPITAL LETTER G WITH ACUTE
+# 01F5 valid # LATIN SMALL LETTER G WITH ACUTE
+01F6 >0195 # LATIN CAPITAL LETTER HWAIR
+01F7 >01BF # LATIN CAPITAL LETTER WYNN
+01F8 >01F9 # LATIN CAPITAL LETTER N WITH GRAVE
+# 01F9 valid # LATIN SMALL LETTER N WITH GRAVE
+01FA >01FB # LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE
+# 01FB valid # LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE
+01FC >01FD # LATIN CAPITAL LETTER AE WITH ACUTE
+# 01FD valid # LATIN SMALL LETTER AE WITH ACUTE
+01FE >01FF # LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
+# 01FF valid # LATIN SMALL LETTER O WITH STROKE AND ACUTE
+0200 >0201 # LATIN CAPITAL LETTER A WITH DOUBLE GRAVE
+# 0201 valid # LATIN SMALL LETTER A WITH DOUBLE GRAVE
+0202 >0203 # LATIN CAPITAL LETTER A WITH INVERTED BREVE
+# 0203 valid # LATIN SMALL LETTER A WITH INVERTED BREVE
+0204 >0205 # LATIN CAPITAL LETTER E WITH DOUBLE GRAVE
+# 0205 valid # LATIN SMALL LETTER E WITH DOUBLE GRAVE
+0206 >0207 # LATIN CAPITAL LETTER E WITH INVERTED BREVE
+# 0207 valid # LATIN SMALL LETTER E WITH INVERTED BREVE
+0208 >0209 # LATIN CAPITAL LETTER I WITH DOUBLE GRAVE
+# 0209 valid # LATIN SMALL LETTER I WITH DOUBLE GRAVE
+020A >020B # LATIN CAPITAL LETTER I WITH INVERTED BREVE
+# 020B valid # LATIN SMALL LETTER I WITH INVERTED BREVE
+020C >020D # LATIN CAPITAL LETTER O WITH DOUBLE GRAVE
+# 020D valid # LATIN SMALL LETTER O WITH DOUBLE GRAVE
+020E >020F # LATIN CAPITAL LETTER O WITH INVERTED BREVE
+# 020F valid # LATIN SMALL LETTER O WITH INVERTED BREVE
+0210 >0211 # LATIN CAPITAL LETTER R WITH DOUBLE GRAVE
+# 0211 valid # LATIN SMALL LETTER R WITH DOUBLE GRAVE
+0212 >0213 # LATIN CAPITAL LETTER R WITH INVERTED BREVE
+# 0213 valid # LATIN SMALL LETTER R WITH INVERTED BREVE
+0214 >0215 # LATIN CAPITAL LETTER U WITH DOUBLE GRAVE
+# 0215 valid # LATIN SMALL LETTER U WITH DOUBLE GRAVE
+0216 >0217 # LATIN CAPITAL LETTER U WITH INVERTED BREVE
+# 0217 valid # LATIN SMALL LETTER U WITH INVERTED BREVE
+0218 >0219 # LATIN CAPITAL LETTER S WITH COMMA BELOW
+# 0219 valid # LATIN SMALL LETTER S WITH COMMA BELOW
+021A >021B # LATIN CAPITAL LETTER T WITH COMMA BELOW
+# 021B valid # LATIN SMALL LETTER T WITH COMMA BELOW
+021C >021D # LATIN CAPITAL LETTER YOGH
+# 021D valid # LATIN SMALL LETTER YOGH
+021E >021F # LATIN CAPITAL LETTER H WITH CARON
+# 021F valid # LATIN SMALL LETTER H WITH CARON
+0220 >019E # LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
+# 0221 valid # LATIN SMALL LETTER D WITH CURL
+0222 >0223 # LATIN CAPITAL LETTER OU
+# 0223 valid # LATIN SMALL LETTER OU
+0224 >0225 # LATIN CAPITAL LETTER Z WITH HOOK
+# 0225 valid # LATIN SMALL LETTER Z WITH HOOK
+0226 >0227 # LATIN CAPITAL LETTER A WITH DOT ABOVE
+# 0227 valid # LATIN SMALL LETTER A WITH DOT ABOVE
+0228 >0229 # LATIN CAPITAL LETTER E WITH CEDILLA
+# 0229 valid # LATIN SMALL LETTER E WITH CEDILLA
+022A >022B # LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON
+# 022B valid # LATIN SMALL LETTER O WITH DIAERESIS AND MACRON
+022C >022D # LATIN CAPITAL LETTER O WITH TILDE AND MACRON
+# 022D valid # LATIN SMALL LETTER O WITH TILDE AND MACRON
+022E >022F # LATIN CAPITAL LETTER O WITH DOT ABOVE
+# 022F valid # LATIN SMALL LETTER O WITH DOT ABOVE
+0230 >0231 # LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON
+# 0231 valid # LATIN SMALL LETTER O WITH DOT ABOVE AND MACRON
+0232 >0233 # LATIN CAPITAL LETTER Y WITH MACRON
+# 0233..0239 valid # LATIN SMALL LETTER Y WITH MACRON..LATIN SMALL LETTER QP DIGRAPH
+023A >2C65 # LATIN CAPITAL LETTER A WITH STROKE
+023B >023C # LATIN CAPITAL LETTER C WITH STROKE
+# 023C valid # LATIN SMALL LETTER C WITH STROKE
+023D >019A # LATIN CAPITAL LETTER L WITH BAR
+023E >2C66 # LATIN CAPITAL LETTER T WITH DIAGONAL STROKE
+# 023F..0240 valid # LATIN SMALL LETTER S WITH SWASH TAIL..LATIN SMALL LETTER Z WITH SWASH TAIL
+0241 >0242 # LATIN CAPITAL LETTER GLOTTAL STOP
+# 0242 valid # LATIN SMALL LETTER GLOTTAL STOP
+0243 >0180 # LATIN CAPITAL LETTER B WITH STROKE
+0244 >0289 # LATIN CAPITAL LETTER U BAR
+0245 >028C # LATIN CAPITAL LETTER TURNED V
+0246 >0247 # LATIN CAPITAL LETTER E WITH STROKE
+# 0247 valid # LATIN SMALL LETTER E WITH STROKE
+0248 >0249 # LATIN CAPITAL LETTER J WITH STROKE
+# 0249 valid # LATIN SMALL LETTER J WITH STROKE
+024A >024B # LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL
+# 024B valid # LATIN SMALL LETTER Q WITH HOOK TAIL
+024C >024D # LATIN CAPITAL LETTER R WITH STROKE
+# 024D valid # LATIN SMALL LETTER R WITH STROKE
+024E >024F # LATIN CAPITAL LETTER Y WITH STROKE
+# 024F..02AF valid # LATIN SMALL LETTER Y WITH STROKE..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL
+02B0 >0068 # MODIFIER LETTER SMALL H
+02B1 >0266 # MODIFIER LETTER SMALL H WITH HOOK
+02B2 >006A # MODIFIER LETTER SMALL J
+02B3 >0072 # MODIFIER LETTER SMALL R
+02B4 >0279 # MODIFIER LETTER SMALL TURNED R
+02B5 >027B # MODIFIER LETTER SMALL TURNED R WITH HOOK
+02B6 >0281 # MODIFIER LETTER SMALL CAPITAL INVERTED R
+02B7 >0077 # MODIFIER LETTER SMALL W
+02B8 >0079 # MODIFIER LETTER SMALL Y
+# 02B9..02D7 valid # MODIFIER LETTER PRIME..MODIFIER LETTER MINUS SIGN
+02D8..02DD >FFFD # BREVE..DOUBLE ACUTE ACCENT
+# 02DE..02DF valid # MODIFIER LETTER RHOTIC HOOK..MODIFIER LETTER CROSS ACCENT
+02E0 >0263 # MODIFIER LETTER SMALL GAMMA
+02E1 >006C # MODIFIER LETTER SMALL L
+02E2 >0073 # MODIFIER LETTER SMALL S
+02E3 >0078 # MODIFIER LETTER SMALL X
+02E4 >0295 # MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
+# 02E5..033F valid # MODIFIER LETTER EXTRA-HIGH TONE BAR..COMBINING DOUBLE OVERLINE
+0340 >0300 # COMBINING GRAVE TONE MARK
+0341 >0301 # COMBINING ACUTE TONE MARK
+# 0342 valid # COMBINING GREEK PERISPOMENI
+0343 >0313 # COMBINING GREEK KORONIS
+0344 >0308 0301 # COMBINING GREEK DIALYTIKA TONOS
+0345 >03B9 # COMBINING GREEK YPOGEGRAMMENI
+# 0346..034E valid # COMBINING BRIDGE ABOVE..COMBINING UPWARDS ARROW BELOW
+034F > # COMBINING GRAPHEME JOINER
+# 0350..036F valid # COMBINING RIGHT ARROWHEAD ABOVE..COMBINING LATIN SMALL LETTER X
+0370 >0371 # GREEK CAPITAL LETTER HETA
+# 0371 valid # GREEK SMALL LETTER HETA
+0372 >0373 # GREEK CAPITAL LETTER ARCHAIC SAMPI
+# 0373 valid # GREEK SMALL LETTER ARCHAIC SAMPI
+0374 >02B9 # GREEK NUMERAL SIGN
+# 0375 valid # GREEK LOWER NUMERAL SIGN
+0376 >0377 # GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA
+# 0377 valid # GREEK SMALL LETTER PAMPHYLIAN DIGAMMA
+0378..0379 >FFFD # <reserved-0378>..<reserved-0379>
+037A >FFFD # GREEK YPOGEGRAMMENI
+# 037B..037D valid # GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL
+037E >FFFD # GREEK QUESTION MARK
+037F..0383 >FFFD # <reserved-037F>..<reserved-0383>
+0384..0385 >FFFD # GREEK TONOS..GREEK DIALYTIKA TONOS
+0386 >03AC # GREEK CAPITAL LETTER ALPHA WITH TONOS
+0387 >00B7 # GREEK ANO TELEIA
+0388 >03AD # GREEK CAPITAL LETTER EPSILON WITH TONOS
+0389 >03AE # GREEK CAPITAL LETTER ETA WITH TONOS
+038A >03AF # GREEK CAPITAL LETTER IOTA WITH TONOS
+038B >FFFD # <reserved-038B>
+038C >03CC # GREEK CAPITAL LETTER OMICRON WITH TONOS
+038D >FFFD # <reserved-038D>
+038E >03CD # GREEK CAPITAL LETTER UPSILON WITH TONOS
+038F >03CE # GREEK CAPITAL LETTER OMEGA WITH TONOS
+# 0390 valid # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+0391 >03B1 # GREEK CAPITAL LETTER ALPHA
+0392 >03B2 # GREEK CAPITAL LETTER BETA
+0393 >03B3 # GREEK CAPITAL LETTER GAMMA
+0394 >03B4 # GREEK CAPITAL LETTER DELTA
+0395 >03B5 # GREEK CAPITAL LETTER EPSILON
+0396 >03B6 # GREEK CAPITAL LETTER ZETA
+0397 >03B7 # GREEK CAPITAL LETTER ETA
+0398 >03B8 # GREEK CAPITAL LETTER THETA
+0399 >03B9 # GREEK CAPITAL LETTER IOTA
+039A >03BA # GREEK CAPITAL LETTER KAPPA
+039B >03BB # GREEK CAPITAL LETTER LAMDA
+039C >03BC # GREEK CAPITAL LETTER MU
+039D >03BD # GREEK CAPITAL LETTER NU
+039E >03BE # GREEK CAPITAL LETTER XI
+039F >03BF # GREEK CAPITAL LETTER OMICRON
+03A0 >03C0 # GREEK CAPITAL LETTER PI
+03A1 >03C1 # GREEK CAPITAL LETTER RHO
+03A2 >FFFD # <reserved-03A2>
+03A3 >03C3 # GREEK CAPITAL LETTER SIGMA
+03A4 >03C4 # GREEK CAPITAL LETTER TAU
+03A5 >03C5 # GREEK CAPITAL LETTER UPSILON
+03A6 >03C6 # GREEK CAPITAL LETTER PHI
+03A7 >03C7 # GREEK CAPITAL LETTER CHI
+03A8 >03C8 # GREEK CAPITAL LETTER PSI
+03A9 >03C9 # GREEK CAPITAL LETTER OMEGA
+03AA >03CA # GREEK CAPITAL LETTER IOTA WITH DIALYTIKA
+03AB >03CB # GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA
+# 03AC..03C1 valid # GREEK SMALL LETTER ALPHA WITH TONOS..GREEK SMALL LETTER RHO
+# 03C2 deviation ; 03C3 # GREEK SMALL LETTER FINAL SIGMA
+# 03C3..03CE valid # GREEK SMALL LETTER SIGMA..GREEK SMALL LETTER OMEGA WITH TONOS
+03CF >03D7 # GREEK CAPITAL KAI SYMBOL
+03D0 >03B2 # GREEK BETA SYMBOL
+03D1 >03B8 # GREEK THETA SYMBOL
+03D2 >03C5 # GREEK UPSILON WITH HOOK SYMBOL
+03D3 >03CD # GREEK UPSILON WITH ACUTE AND HOOK SYMBOL
+03D4 >03CB # GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL
+03D5 >03C6 # GREEK PHI SYMBOL
+03D6 >03C0 # GREEK PI SYMBOL
+# 03D7 valid # GREEK KAI SYMBOL
+03D8 >03D9 # GREEK LETTER ARCHAIC KOPPA
+# 03D9 valid # GREEK SMALL LETTER ARCHAIC KOPPA
+03DA >03DB # GREEK LETTER STIGMA
+# 03DB valid # GREEK SMALL LETTER STIGMA
+03DC >03DD # GREEK LETTER DIGAMMA
+# 03DD valid # GREEK SMALL LETTER DIGAMMA
+03DE >03DF # GREEK LETTER KOPPA
+# 03DF valid # GREEK SMALL LETTER KOPPA
+03E0 >03E1 # GREEK LETTER SAMPI
+# 03E1 valid # GREEK SMALL LETTER SAMPI
+03E2 >03E3 # COPTIC CAPITAL LETTER SHEI
+# 03E3 valid # COPTIC SMALL LETTER SHEI
+03E4 >03E5 # COPTIC CAPITAL LETTER FEI
+# 03E5 valid # COPTIC SMALL LETTER FEI
+03E6 >03E7 # COPTIC CAPITAL LETTER KHEI
+# 03E7 valid # COPTIC SMALL LETTER KHEI
+03E8 >03E9 # COPTIC CAPITAL LETTER HORI
+# 03E9 valid # COPTIC SMALL LETTER HORI
+03EA >03EB # COPTIC CAPITAL LETTER GANGIA
+# 03EB valid # COPTIC SMALL LETTER GANGIA
+03EC >03ED # COPTIC CAPITAL LETTER SHIMA
+# 03ED valid # COPTIC SMALL LETTER SHIMA
+03EE >03EF # COPTIC CAPITAL LETTER DEI
+# 03EF valid # COPTIC SMALL LETTER DEI
+03F0 >03BA # GREEK KAPPA SYMBOL
+03F1 >03C1 # GREEK RHO SYMBOL
+03F2 >03C3 # GREEK LUNATE SIGMA SYMBOL
+# 03F3 valid # GREEK LETTER YOT
+03F4 >03B8 # GREEK CAPITAL THETA SYMBOL
+03F5 >03B5 # GREEK LUNATE EPSILON SYMBOL
+# 03F6 valid # GREEK REVERSED LUNATE EPSILON SYMBOL
+03F7 >03F8 # GREEK CAPITAL LETTER SHO
+# 03F8 valid # GREEK SMALL LETTER SHO
+03F9 >03C3 # GREEK CAPITAL LUNATE SIGMA SYMBOL
+03FA >03FB # GREEK CAPITAL LETTER SAN
+# 03FB..03FC valid # GREEK SMALL LETTER SAN..GREEK RHO WITH STROKE SYMBOL
+03FD >037B # GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL
+03FE >037C # GREEK CAPITAL DOTTED LUNATE SIGMA SYMBOL
+03FF >037D # GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL
+0400 >0450 # CYRILLIC CAPITAL LETTER IE WITH GRAVE
+0401 >0451 # CYRILLIC CAPITAL LETTER IO
+0402 >0452 # CYRILLIC CAPITAL LETTER DJE
+0403 >0453 # CYRILLIC CAPITAL LETTER GJE
+0404 >0454 # CYRILLIC CAPITAL LETTER UKRAINIAN IE
+0405 >0455 # CYRILLIC CAPITAL LETTER DZE
+0406 >0456 # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I
+0407 >0457 # CYRILLIC CAPITAL LETTER YI
+0408 >0458 # CYRILLIC CAPITAL LETTER JE
+0409 >0459 # CYRILLIC CAPITAL LETTER LJE
+040A >045A # CYRILLIC CAPITAL LETTER NJE
+040B >045B # CYRILLIC CAPITAL LETTER TSHE
+040C >045C # CYRILLIC CAPITAL LETTER KJE
+040D >045D # CYRILLIC CAPITAL LETTER I WITH GRAVE
+040E >045E # CYRILLIC CAPITAL LETTER SHORT U
+040F >045F # CYRILLIC CAPITAL LETTER DZHE
+0410 >0430 # CYRILLIC CAPITAL LETTER A
+0411 >0431 # CYRILLIC CAPITAL LETTER BE
+0412 >0432 # CYRILLIC CAPITAL LETTER VE
+0413 >0433 # CYRILLIC CAPITAL LETTER GHE
+0414 >0434 # CYRILLIC CAPITAL LETTER DE
+0415 >0435 # CYRILLIC CAPITAL LETTER IE
+0416 >0436 # CYRILLIC CAPITAL LETTER ZHE
+0417 >0437 # CYRILLIC CAPITAL LETTER ZE
+0418 >0438 # CYRILLIC CAPITAL LETTER I
+0419 >0439 # CYRILLIC CAPITAL LETTER SHORT I
+041A >043A # CYRILLIC CAPITAL LETTER KA
+041B >043B # CYRILLIC CAPITAL LETTER EL
+041C >043C # CYRILLIC CAPITAL LETTER EM
+041D >043D # CYRILLIC CAPITAL LETTER EN
+041E >043E # CYRILLIC CAPITAL LETTER O
+041F >043F # CYRILLIC CAPITAL LETTER PE
+0420 >0440 # CYRILLIC CAPITAL LETTER ER
+0421 >0441 # CYRILLIC CAPITAL LETTER ES
+0422 >0442 # CYRILLIC CAPITAL LETTER TE
+0423 >0443 # CYRILLIC CAPITAL LETTER U
+0424 >0444 # CYRILLIC CAPITAL LETTER EF
+0425 >0445 # CYRILLIC CAPITAL LETTER HA
+0426 >0446 # CYRILLIC CAPITAL LETTER TSE
+0427 >0447 # CYRILLIC CAPITAL LETTER CHE
+0428 >0448 # CYRILLIC CAPITAL LETTER SHA
+0429 >0449 # CYRILLIC CAPITAL LETTER SHCHA
+042A >044A # CYRILLIC CAPITAL LETTER HARD SIGN
+042B >044B # CYRILLIC CAPITAL LETTER YERU
+042C >044C # CYRILLIC CAPITAL LETTER SOFT SIGN
+042D >044D # CYRILLIC CAPITAL LETTER E
+042E >044E # CYRILLIC CAPITAL LETTER YU
+042F >044F # CYRILLIC CAPITAL LETTER YA
+# 0430..045F valid # CYRILLIC SMALL LETTER A..CYRILLIC SMALL LETTER DZHE
+0460 >0461 # CYRILLIC CAPITAL LETTER OMEGA
+# 0461 valid # CYRILLIC SMALL LETTER OMEGA
+0462 >0463 # CYRILLIC CAPITAL LETTER YAT
+# 0463 valid # CYRILLIC SMALL LETTER YAT
+0464 >0465 # CYRILLIC CAPITAL LETTER IOTIFIED E
+# 0465 valid # CYRILLIC SMALL LETTER IOTIFIED E
+0466 >0467 # CYRILLIC CAPITAL LETTER LITTLE YUS
+# 0467 valid # CYRILLIC SMALL LETTER LITTLE YUS
+0468 >0469 # CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS
+# 0469 valid # CYRILLIC SMALL LETTER IOTIFIED LITTLE YUS
+046A >046B # CYRILLIC CAPITAL LETTER BIG YUS
+# 046B valid # CYRILLIC SMALL LETTER BIG YUS
+046C >046D # CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS
+# 046D valid # CYRILLIC SMALL LETTER IOTIFIED BIG YUS
+046E >046F # CYRILLIC CAPITAL LETTER KSI
+# 046F valid # CYRILLIC SMALL LETTER KSI
+0470 >0471 # CYRILLIC CAPITAL LETTER PSI
+# 0471 valid # CYRILLIC SMALL LETTER PSI
+0472 >0473 # CYRILLIC CAPITAL LETTER FITA
+# 0473 valid # CYRILLIC SMALL LETTER FITA
+0474 >0475 # CYRILLIC CAPITAL LETTER IZHITSA
+# 0475 valid # CYRILLIC SMALL LETTER IZHITSA
+0476 >0477 # CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT
+# 0477 valid # CYRILLIC SMALL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT
+0478 >0479 # CYRILLIC CAPITAL LETTER UK
+# 0479 valid # CYRILLIC SMALL LETTER UK
+047A >047B # CYRILLIC CAPITAL LETTER ROUND OMEGA
+# 047B valid # CYRILLIC SMALL LETTER ROUND OMEGA
+047C >047D # CYRILLIC CAPITAL LETTER OMEGA WITH TITLO
+# 047D valid # CYRILLIC SMALL LETTER OMEGA WITH TITLO
+047E >047F # CYRILLIC CAPITAL LETTER OT
+# 047F valid # CYRILLIC SMALL LETTER OT
+0480 >0481 # CYRILLIC CAPITAL LETTER KOPPA
+# 0481..0489 valid # CYRILLIC SMALL LETTER KOPPA..COMBINING CYRILLIC MILLIONS SIGN
+048A >048B # CYRILLIC CAPITAL LETTER SHORT I WITH TAIL
+# 048B valid # CYRILLIC SMALL LETTER SHORT I WITH TAIL
+048C >048D # CYRILLIC CAPITAL LETTER SEMISOFT SIGN
+# 048D valid # CYRILLIC SMALL LETTER SEMISOFT SIGN
+048E >048F # CYRILLIC CAPITAL LETTER ER WITH TICK
+# 048F valid # CYRILLIC SMALL LETTER ER WITH TICK
+0490 >0491 # CYRILLIC CAPITAL LETTER GHE WITH UPTURN
+# 0491 valid # CYRILLIC SMALL LETTER GHE WITH UPTURN
+0492 >0493 # CYRILLIC CAPITAL LETTER GHE WITH STROKE
+# 0493 valid # CYRILLIC SMALL LETTER GHE WITH STROKE
+0494 >0495 # CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK
+# 0495 valid # CYRILLIC SMALL LETTER GHE WITH MIDDLE HOOK
+0496 >0497 # CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER
+# 0497 valid # CYRILLIC SMALL LETTER ZHE WITH DESCENDER
+0498 >0499 # CYRILLIC CAPITAL LETTER ZE WITH DESCENDER
+# 0499 valid # CYRILLIC SMALL LETTER ZE WITH DESCENDER
+049A >049B # CYRILLIC CAPITAL LETTER KA WITH DESCENDER
+# 049B valid # CYRILLIC SMALL LETTER KA WITH DESCENDER
+049C >049D # CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE
+# 049D valid # CYRILLIC SMALL LETTER KA WITH VERTICAL STROKE
+049E >049F # CYRILLIC CAPITAL LETTER KA WITH STROKE
+# 049F valid # CYRILLIC SMALL LETTER KA WITH STROKE
+04A0 >04A1 # CYRILLIC CAPITAL LETTER BASHKIR KA
+# 04A1 valid # CYRILLIC SMALL LETTER BASHKIR KA
+04A2 >04A3 # CYRILLIC CAPITAL LETTER EN WITH DESCENDER
+# 04A3 valid # CYRILLIC SMALL LETTER EN WITH DESCENDER
+04A4 >04A5 # CYRILLIC CAPITAL LIGATURE EN GHE
+# 04A5 valid # CYRILLIC SMALL LIGATURE EN GHE
+04A6 >04A7 # CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK
+# 04A7 valid # CYRILLIC SMALL LETTER PE WITH MIDDLE HOOK
+04A8 >04A9 # CYRILLIC CAPITAL LETTER ABKHASIAN HA
+# 04A9 valid # CYRILLIC SMALL LETTER ABKHASIAN HA
+04AA >04AB # CYRILLIC CAPITAL LETTER ES WITH DESCENDER
+# 04AB valid # CYRILLIC SMALL LETTER ES WITH DESCENDER
+04AC >04AD # CYRILLIC CAPITAL LETTER TE WITH DESCENDER
+# 04AD valid # CYRILLIC SMALL LETTER TE WITH DESCENDER
+04AE >04AF # CYRILLIC CAPITAL LETTER STRAIGHT U
+# 04AF valid # CYRILLIC SMALL LETTER STRAIGHT U
+04B0 >04B1 # CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE
+# 04B1 valid # CYRILLIC SMALL LETTER STRAIGHT U WITH STROKE
+04B2 >04B3 # CYRILLIC CAPITAL LETTER HA WITH DESCENDER
+# 04B3 valid # CYRILLIC SMALL LETTER HA WITH DESCENDER
+04B4 >04B5 # CYRILLIC CAPITAL LIGATURE TE TSE
+# 04B5 valid # CYRILLIC SMALL LIGATURE TE TSE
+04B6 >04B7 # CYRILLIC CAPITAL LETTER CHE WITH DESCENDER
+# 04B7 valid # CYRILLIC SMALL LETTER CHE WITH DESCENDER
+04B8 >04B9 # CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE
+# 04B9 valid # CYRILLIC SMALL LETTER CHE WITH VERTICAL STROKE
+04BA >04BB # CYRILLIC CAPITAL LETTER SHHA
+# 04BB valid # CYRILLIC SMALL LETTER SHHA
+04BC >04BD # CYRILLIC CAPITAL LETTER ABKHASIAN CHE
+# 04BD valid # CYRILLIC SMALL LETTER ABKHASIAN CHE
+04BE >04BF # CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER
+# 04BF valid # CYRILLIC SMALL LETTER ABKHASIAN CHE WITH DESCENDER
+04C0 >FFFD # CYRILLIC LETTER PALOCHKA
+04C1 >04C2 # CYRILLIC CAPITAL LETTER ZHE WITH BREVE
+# 04C2 valid # CYRILLIC SMALL LETTER ZHE WITH BREVE
+04C3 >04C4 # CYRILLIC CAPITAL LETTER KA WITH HOOK
+# 04C4 valid # CYRILLIC SMALL LETTER KA WITH HOOK
+04C5 >04C6 # CYRILLIC CAPITAL LETTER EL WITH TAIL
+# 04C6 valid # CYRILLIC SMALL LETTER EL WITH TAIL
+04C7 >04C8 # CYRILLIC CAPITAL LETTER EN WITH HOOK
+# 04C8 valid # CYRILLIC SMALL LETTER EN WITH HOOK
+04C9 >04CA # CYRILLIC CAPITAL LETTER EN WITH TAIL
+# 04CA valid # CYRILLIC SMALL LETTER EN WITH TAIL
+04CB >04CC # CYRILLIC CAPITAL LETTER KHAKASSIAN CHE
+# 04CC valid # CYRILLIC SMALL LETTER KHAKASSIAN CHE
+04CD >04CE # CYRILLIC CAPITAL LETTER EM WITH TAIL
+# 04CE..04CF valid # CYRILLIC SMALL LETTER EM WITH TAIL..CYRILLIC SMALL LETTER PALOCHKA
+04D0 >04D1 # CYRILLIC CAPITAL LETTER A WITH BREVE
+# 04D1 valid # CYRILLIC SMALL LETTER A WITH BREVE
+04D2 >04D3 # CYRILLIC CAPITAL LETTER A WITH DIAERESIS
+# 04D3 valid # CYRILLIC SMALL LETTER A WITH DIAERESIS
+04D4 >04D5 # CYRILLIC CAPITAL LIGATURE A IE
+# 04D5 valid # CYRILLIC SMALL LIGATURE A IE
+04D6 >04D7 # CYRILLIC CAPITAL LETTER IE WITH BREVE
+# 04D7 valid # CYRILLIC SMALL LETTER IE WITH BREVE
+04D8 >04D9 # CYRILLIC CAPITAL LETTER SCHWA
+# 04D9 valid # CYRILLIC SMALL LETTER SCHWA
+04DA >04DB # CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS
+# 04DB valid # CYRILLIC SMALL LETTER SCHWA WITH DIAERESIS
+04DC >04DD # CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS
+# 04DD valid # CYRILLIC SMALL LETTER ZHE WITH DIAERESIS
+04DE >04DF # CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS
+# 04DF valid # CYRILLIC SMALL LETTER ZE WITH DIAERESIS
+04E0 >04E1 # CYRILLIC CAPITAL LETTER ABKHASIAN DZE
+# 04E1 valid # CYRILLIC SMALL LETTER ABKHASIAN DZE
+04E2 >04E3 # CYRILLIC CAPITAL LETTER I WITH MACRON
+# 04E3 valid # CYRILLIC SMALL LETTER I WITH MACRON
+04E4 >04E5 # CYRILLIC CAPITAL LETTER I WITH DIAERESIS
+# 04E5 valid # CYRILLIC SMALL LETTER I WITH DIAERESIS
+04E6 >04E7 # CYRILLIC CAPITAL LETTER O WITH DIAERESIS
+# 04E7 valid # CYRILLIC SMALL LETTER O WITH DIAERESIS
+04E8 >04E9 # CYRILLIC CAPITAL LETTER BARRED O
+# 04E9 valid # CYRILLIC SMALL LETTER BARRED O
+04EA >04EB # CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS
+# 04EB valid # CYRILLIC SMALL LETTER BARRED O WITH DIAERESIS
+04EC >04ED # CYRILLIC CAPITAL LETTER E WITH DIAERESIS
+# 04ED valid # CYRILLIC SMALL LETTER E WITH DIAERESIS
+04EE >04EF # CYRILLIC CAPITAL LETTER U WITH MACRON
+# 04EF valid # CYRILLIC SMALL LETTER U WITH MACRON
+04F0 >04F1 # CYRILLIC CAPITAL LETTER U WITH DIAERESIS
+# 04F1 valid # CYRILLIC SMALL LETTER U WITH DIAERESIS
+04F2 >04F3 # CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE
+# 04F3 valid # CYRILLIC SMALL LETTER U WITH DOUBLE ACUTE
+04F4 >04F5 # CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS
+# 04F5 valid # CYRILLIC SMALL LETTER CHE WITH DIAERESIS
+04F6 >04F7 # CYRILLIC CAPITAL LETTER GHE WITH DESCENDER
+# 04F7 valid # CYRILLIC SMALL LETTER GHE WITH DESCENDER
+04F8 >04F9 # CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS
+# 04F9 valid # CYRILLIC SMALL LETTER YERU WITH DIAERESIS
+04FA >04FB # CYRILLIC CAPITAL LETTER GHE WITH STROKE AND HOOK
+# 04FB valid # CYRILLIC SMALL LETTER GHE WITH STROKE AND HOOK
+04FC >04FD # CYRILLIC CAPITAL LETTER HA WITH HOOK
+# 04FD valid # CYRILLIC SMALL LETTER HA WITH HOOK
+04FE >04FF # CYRILLIC CAPITAL LETTER HA WITH STROKE
+# 04FF valid # CYRILLIC SMALL LETTER HA WITH STROKE
+0500 >0501 # CYRILLIC CAPITAL LETTER KOMI DE
+# 0501 valid # CYRILLIC SMALL LETTER KOMI DE
+0502 >0503 # CYRILLIC CAPITAL LETTER KOMI DJE
+# 0503 valid # CYRILLIC SMALL LETTER KOMI DJE
+0504 >0505 # CYRILLIC CAPITAL LETTER KOMI ZJE
+# 0505 valid # CYRILLIC SMALL LETTER KOMI ZJE
+0506 >0507 # CYRILLIC CAPITAL LETTER KOMI DZJE
+# 0507 valid # CYRILLIC SMALL LETTER KOMI DZJE
+0508 >0509 # CYRILLIC CAPITAL LETTER KOMI LJE
+# 0509 valid # CYRILLIC SMALL LETTER KOMI LJE
+050A >050B # CYRILLIC CAPITAL LETTER KOMI NJE
+# 050B valid # CYRILLIC SMALL LETTER KOMI NJE
+050C >050D # CYRILLIC CAPITAL LETTER KOMI SJE
+# 050D valid # CYRILLIC SMALL LETTER KOMI SJE
+050E >050F # CYRILLIC CAPITAL LETTER KOMI TJE
+# 050F valid # CYRILLIC SMALL LETTER KOMI TJE
+0510 >0511 # CYRILLIC CAPITAL LETTER REVERSED ZE
+# 0511 valid # CYRILLIC SMALL LETTER REVERSED ZE
+0512 >0513 # CYRILLIC CAPITAL LETTER EL WITH HOOK
+# 0513 valid # CYRILLIC SMALL LETTER EL WITH HOOK
+0514 >0515 # CYRILLIC CAPITAL LETTER LHA
+# 0515 valid # CYRILLIC SMALL LETTER LHA
+0516 >0517 # CYRILLIC CAPITAL LETTER RHA
+# 0517 valid # CYRILLIC SMALL LETTER RHA
+0518 >0519 # CYRILLIC CAPITAL LETTER YAE
+# 0519 valid # CYRILLIC SMALL LETTER YAE
+051A >051B # CYRILLIC CAPITAL LETTER QA
+# 051B valid # CYRILLIC SMALL LETTER QA
+051C >051D # CYRILLIC CAPITAL LETTER WE
+# 051D valid # CYRILLIC SMALL LETTER WE
+051E >051F # CYRILLIC CAPITAL LETTER ALEUT KA
+# 051F valid # CYRILLIC SMALL LETTER ALEUT KA
+0520 >0521 # CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK
+# 0521 valid # CYRILLIC SMALL LETTER EL WITH MIDDLE HOOK
+0522 >0523 # CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK
+# 0523 valid # CYRILLIC SMALL LETTER EN WITH MIDDLE HOOK
+0524 >0525 # CYRILLIC CAPITAL LETTER PE WITH DESCENDER
+# 0525 valid # CYRILLIC SMALL LETTER PE WITH DESCENDER
+0526..0530 >FFFD # ..
+0531 >0561 # ARMENIAN CAPITAL LETTER AYB
+0532 >0562 # ARMENIAN CAPITAL LETTER BEN
+0533 >0563 # ARMENIAN CAPITAL LETTER GIM
+0534 >0564 # ARMENIAN CAPITAL LETTER DA
+0535 >0565 # ARMENIAN CAPITAL LETTER ECH
+0536 >0566 # ARMENIAN CAPITAL LETTER ZA
+0537 >0567 # ARMENIAN CAPITAL LETTER EH
+0538 >0568 # ARMENIAN CAPITAL LETTER ET
+0539 >0569 # ARMENIAN CAPITAL LETTER TO
+053A >056A # ARMENIAN CAPITAL LETTER ZHE
+053B >056B # ARMENIAN CAPITAL LETTER INI
+053C >056C # ARMENIAN CAPITAL LETTER LIWN
+053D >056D # ARMENIAN CAPITAL LETTER XEH
+053E >056E # ARMENIAN CAPITAL LETTER CA
+053F >056F # ARMENIAN CAPITAL LETTER KEN
+0540 >0570 # ARMENIAN CAPITAL LETTER HO
+0541 >0571 # ARMENIAN CAPITAL LETTER JA
+0542 >0572 # ARMENIAN CAPITAL LETTER GHAD
+0543 >0573 # ARMENIAN CAPITAL LETTER CHEH
+0544 >0574 # ARMENIAN CAPITAL LETTER MEN
+0545 >0575 # ARMENIAN CAPITAL LETTER YI
+0546 >0576 # ARMENIAN CAPITAL LETTER NOW
+0547 >0577 # ARMENIAN CAPITAL LETTER SHA
+0548 >0578 # ARMENIAN CAPITAL LETTER VO
+0549 >0579 # ARMENIAN CAPITAL LETTER CHA
+054A >057A # ARMENIAN CAPITAL LETTER PEH
+054B >057B # ARMENIAN CAPITAL LETTER JHEH
+054C >057C # ARMENIAN CAPITAL LETTER RA
+054D >057D # ARMENIAN CAPITAL LETTER SEH
+054E >057E # ARMENIAN CAPITAL LETTER VEW
+054F >057F # ARMENIAN CAPITAL LETTER TIWN
+0550 >0580 # ARMENIAN CAPITAL LETTER REH
+0551 >0581 # ARMENIAN CAPITAL LETTER CO
+0552 >0582 # ARMENIAN CAPITAL LETTER YIWN
+0553 >0583 # ARMENIAN CAPITAL LETTER PIWR
+0554 >0584 # ARMENIAN CAPITAL LETTER KEH
+0555 >0585 # ARMENIAN CAPITAL LETTER OH
+0556 >0586 # ARMENIAN CAPITAL LETTER FEH
+0557..0558 >FFFD # ..
+# 0559..055F valid # ARMENIAN MODIFIER LETTER LEFT HALF RING..ARMENIAN ABBREVIATION MARK
+0560 >FFFD #
+# 0561..0586 valid # ARMENIAN SMALL LETTER AYB..ARMENIAN SMALL LETTER FEH
+0587 >0565 0582 # ARMENIAN SMALL LIGATURE ECH YIWN
+0588 >FFFD #
+# 0589..058A valid # ARMENIAN FULL STOP..ARMENIAN HYPHEN
+058B..0590 >FFFD # ..
+# 0591..05C7 valid # HEBREW ACCENT ETNAHTA..HEBREW POINT QAMATS QATAN
+05C8..05CF >FFFD # ..
+# 05D0..05EA valid # HEBREW LETTER ALEF..HEBREW LETTER TAV
+05EB..05EF >FFFD # ..
+# 05F0..05F4 valid # HEBREW LIGATURE YIDDISH DOUBLE VAV..HEBREW PUNCTUATION GERSHAYIM
+05F5..05FF >FFFD # ..
+0600..0603 >FFFD # ARABIC NUMBER SIGN..ARABIC SIGN SAFHA
+0604..0605 >FFFD # ..
+# 0606..061B valid # ARABIC-INDIC CUBE ROOT..ARABIC SEMICOLON
+061C..061D >FFFD # ..
+# 061E..061F valid # ARABIC TRIPLE DOT PUNCTUATION MARK..ARABIC QUESTION MARK
+0620 >FFFD #
+# 0621..065E valid # ARABIC LETTER HAMZA..ARABIC FATHA WITH TWO DOTS
+065F >FFFD #
+# 0660..0674 valid # ARABIC-INDIC DIGIT ZERO..ARABIC LETTER HIGH HAMZA
+0675 >0627 0674 # ARABIC LETTER HIGH HAMZA ALEF
+0676 >0648 0674 # ARABIC LETTER HIGH HAMZA WAW
+0677 >06C7 0674 # ARABIC LETTER U WITH HAMZA ABOVE
+0678 >064A 0674 # ARABIC LETTER HIGH HAMZA YEH
+# 0679..06DC valid # ARABIC LETTER TTEH..ARABIC SMALL HIGH SEEN
+06DD >FFFD # ARABIC END OF AYAH
+# 06DE..070D valid # ARABIC START OF RUB EL HIZB..SYRIAC HARKLEAN ASTERISCUS
+070E >FFFD #
+070F >FFFD # SYRIAC ABBREVIATION MARK
+# 0710..074A valid # SYRIAC LETTER ALAPH..SYRIAC BARREKH
+074B..074C >FFFD # ..
+# 074D..07B1 valid # SYRIAC LETTER SOGDIAN ZHAIN..THAANA LETTER NAA
+07B2..07BF >FFFD # ..
+# 07C0..07FA valid # NKO DIGIT ZERO..NKO LAJANYALAN
+07FB..07FF >FFFD # ..
+# 0800..082D valid # SAMARITAN LETTER ALAF..SAMARITAN MARK NEQUDAA
+082E..082F >FFFD # ..
+# 0830..083E valid # SAMARITAN PUNCTUATION NEQUDAA..SAMARITAN PUNCTUATION ANNAAU
+083F..08FF >FFFD # ..
+# 0900..0939 valid # DEVANAGARI SIGN INVERTED CANDRABINDU..DEVANAGARI LETTER HA
+093A..093B >FFFD # ..
+# 093C..094E valid # DEVANAGARI SIGN NUKTA..DEVANAGARI VOWEL SIGN PRISHTHAMATRA E
+094F >FFFD #
+# 0950..0955 valid # DEVANAGARI OM..DEVANAGARI VOWEL SIGN CANDRA LONG E
+0956..0957 >FFFD # ..
+0958 >0915 093C # DEVANAGARI LETTER QA
+0959 >0916 093C # DEVANAGARI LETTER KHHA
+095A >0917 093C # DEVANAGARI LETTER GHHA
+095B >091C 093C # DEVANAGARI LETTER ZA
+095C >0921 093C # DEVANAGARI LETTER DDDHA
+095D >0922 093C # DEVANAGARI LETTER RHA
+095E >092B 093C # DEVANAGARI LETTER FA
+095F >092F 093C # DEVANAGARI LETTER YYA
+# 0960..0972 valid # DEVANAGARI LETTER VOCALIC RR..DEVANAGARI LETTER CANDRA A
+0973..0978 >FFFD # ..
+# 0979..097F valid # DEVANAGARI LETTER ZHA..DEVANAGARI LETTER BBA
+0980 >FFFD #
+# 0981..0983 valid # BENGALI SIGN CANDRABINDU..BENGALI SIGN VISARGA
+0984 >FFFD #
+# 0985..098C valid # BENGALI LETTER A..BENGALI LETTER VOCALIC L
+098D..098E >FFFD # ..
+# 098F..0990 valid # BENGALI LETTER E..BENGALI LETTER AI
+0991..0992 >FFFD # ..
+# 0993..09A8 valid # BENGALI LETTER O..BENGALI LETTER NA
+09A9 >FFFD #
+# 09AA..09B0 valid # BENGALI LETTER PA..BENGALI LETTER RA
+09B1 >FFFD #
+# 09B2 valid # BENGALI LETTER LA
+09B3..09B5 >FFFD # ..
+# 09B6..09B9 valid # BENGALI LETTER SHA..BENGALI LETTER HA
+09BA..09BB >FFFD # ..
+# 09BC..09C4 valid # BENGALI SIGN NUKTA..BENGALI VOWEL SIGN VOCALIC RR
+09C5..09C6 >FFFD # ..
+# 09C7..09C8 valid # BENGALI VOWEL SIGN E..BENGALI VOWEL SIGN AI
+09C9..09CA >FFFD # ..
+# 09CB..09CE valid # BENGALI VOWEL SIGN O..BENGALI LETTER KHANDA TA
+09CF..09D6 >FFFD # ..
+# 09D7 valid # BENGALI AU LENGTH MARK
+09D8..09DB >FFFD # ..
+09DC >09A1 09BC # BENGALI LETTER RRA
+09DD >09A2 09BC # BENGALI LETTER RHA
+09DE >FFFD #
+09DF >09AF 09BC # BENGALI LETTER YYA
+# 09E0..09E3 valid # BENGALI LETTER VOCALIC RR..BENGALI VOWEL SIGN VOCALIC LL
+09E4..09E5 >FFFD # ..
+# 09E6..09FB valid # BENGALI DIGIT ZERO..BENGALI GANDA MARK
+09FC..0A00 >FFFD # ..
+# 0A01..0A03 valid # GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN VISARGA
+0A04 >FFFD #
+# 0A05..0A0A valid # GURMUKHI LETTER A..GURMUKHI LETTER UU
+0A0B..0A0E >FFFD # ..
+# 0A0F..0A10 valid # GURMUKHI LETTER EE..GURMUKHI LETTER AI
+0A11..0A12 >FFFD # ..
+# 0A13..0A28 valid # GURMUKHI LETTER OO..GURMUKHI LETTER NA
+0A29 >FFFD #
+# 0A2A..0A30 valid # GURMUKHI LETTER PA..GURMUKHI LETTER RA
+0A31 >FFFD #
+# 0A32 valid # GURMUKHI LETTER LA
+0A33 >0A32 0A3C # GURMUKHI LETTER LLA
+0A34 >FFFD #
+# 0A35 valid # GURMUKHI LETTER VA
+0A36 >0A38 0A3C # GURMUKHI LETTER SHA
+0A37 >FFFD #
+# 0A38..0A39 valid # GURMUKHI LETTER SA..GURMUKHI LETTER HA
+0A3A..0A3B >FFFD # ..
+# 0A3C valid # GURMUKHI SIGN NUKTA
+0A3D >FFFD #
+# 0A3E..0A42 valid # GURMUKHI VOWEL SIGN AA..GURMUKHI VOWEL SIGN UU
+0A43..0A46 >FFFD # ..
+# 0A47..0A48 valid # GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI
+0A49..0A4A >FFFD # ..
+# 0A4B..0A4D valid # GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA
+0A4E..0A50 >FFFD # ..
+# 0A51 valid # GURMUKHI SIGN UDAAT
+0A52..0A58 >FFFD # ..
+0A59 >0A16 0A3C # GURMUKHI LETTER KHHA
+0A5A >0A17 0A3C # GURMUKHI LETTER GHHA
+0A5B >0A1C 0A3C # GURMUKHI LETTER ZA
+# 0A5C valid # GURMUKHI LETTER RRA
+0A5D >FFFD #
+0A5E >0A2B 0A3C # GURMUKHI LETTER FA
+0A5F..0A65 >FFFD # ..
+# 0A66..0A75 valid # GURMUKHI DIGIT ZERO..GURMUKHI SIGN YAKASH
+0A76..0A80 >FFFD # ..
+# 0A81..0A83 valid # GUJARATI SIGN CANDRABINDU..GUJARATI SIGN VISARGA
+0A84 >FFFD #
+# 0A85..0A8D valid # GUJARATI LETTER A..GUJARATI VOWEL CANDRA E
+0A8E >FFFD #
+# 0A8F..0A91 valid # GUJARATI LETTER E..GUJARATI VOWEL CANDRA O
+0A92 >FFFD #
+# 0A93..0AA8 valid # GUJARATI LETTER O..GUJARATI LETTER NA
+0AA9 >FFFD #
+# 0AAA..0AB0 valid # GUJARATI LETTER PA..GUJARATI LETTER RA
+0AB1 >FFFD #
+# 0AB2..0AB3 valid # GUJARATI LETTER LA..GUJARATI LETTER LLA
+0AB4 >FFFD #
+# 0AB5..0AB9 valid # GUJARATI LETTER VA..GUJARATI LETTER HA
+0ABA..0ABB >FFFD # ..
+# 0ABC..0AC5 valid # GUJARATI SIGN NUKTA..GUJARATI VOWEL SIGN CANDRA E
+0AC6 >FFFD #
+# 0AC7..0AC9 valid # GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN CANDRA O
+0ACA >FFFD #
+# 0ACB..0ACD valid # GUJARATI VOWEL SIGN O..GUJARATI SIGN VIRAMA
+0ACE..0ACF >FFFD # ..
+# 0AD0 valid # GUJARATI OM
+0AD1..0ADF >FFFD # ..
+# 0AE0..0AE3 valid # GUJARATI LETTER VOCALIC RR..GUJARATI VOWEL SIGN VOCALIC LL
+0AE4..0AE5 >FFFD # ..
+# 0AE6..0AEF valid # GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE
+0AF0 >FFFD #
+# 0AF1 valid # GUJARATI RUPEE SIGN
+0AF2..0B00 >FFFD # ..
+# 0B01..0B03 valid # ORIYA SIGN CANDRABINDU..ORIYA SIGN VISARGA
+0B04 >FFFD #
+# 0B05..0B0C valid # ORIYA LETTER A..ORIYA LETTER VOCALIC L
+0B0D..0B0E >FFFD # ..
+# 0B0F..0B10 valid # ORIYA LETTER E..ORIYA LETTER AI
+0B11..0B12 >FFFD # ..
+# 0B13..0B28 valid # ORIYA LETTER O..ORIYA LETTER NA
+0B29 >FFFD #
+# 0B2A..0B30 valid # ORIYA LETTER PA..ORIYA LETTER RA
+0B31 >FFFD #
+# 0B32..0B33 valid # ORIYA LETTER LA..ORIYA LETTER LLA
+0B34 >FFFD #
+# 0B35..0B39 valid # ORIYA LETTER VA..ORIYA LETTER HA
+0B3A..0B3B >FFFD # ..
+# 0B3C..0B44 valid # ORIYA SIGN NUKTA..ORIYA VOWEL SIGN VOCALIC RR
+0B45..0B46 >FFFD # ..
+# 0B47..0B48 valid # ORIYA VOWEL SIGN E..ORIYA VOWEL SIGN AI
+0B49..0B4A >FFFD # ..
+# 0B4B..0B4D valid # ORIYA VOWEL SIGN O..ORIYA SIGN VIRAMA
+0B4E..0B55 >FFFD # ..
+# 0B56..0B57 valid # ORIYA AI LENGTH MARK..ORIYA AU LENGTH MARK
+0B58..0B5B >FFFD # ..
+0B5C >0B21 0B3C # ORIYA LETTER RRA
+0B5D >0B22 0B3C # ORIYA LETTER RHA
+0B5E >FFFD #
+# 0B5F..0B63 valid # ORIYA LETTER YYA..ORIYA VOWEL SIGN VOCALIC LL
+0B64..0B65 >FFFD # ..
+# 0B66..0B71 valid # ORIYA DIGIT ZERO..ORIYA LETTER WA
+0B72..0B81 >FFFD # ..
+# 0B82..0B83 valid # TAMIL SIGN ANUSVARA..TAMIL SIGN VISARGA
+0B84 >FFFD #
+# 0B85..0B8A valid # TAMIL LETTER A..TAMIL LETTER UU
+0B8B..0B8D >FFFD # ..
+# 0B8E..0B90 valid # TAMIL LETTER E..TAMIL LETTER AI
+0B91 >FFFD #
+# 0B92..0B95 valid # TAMIL LETTER O..TAMIL LETTER KA
+0B96..0B98 >FFFD # ..
+# 0B99..0B9A valid # TAMIL LETTER NGA..TAMIL LETTER CA
+0B9B >FFFD #
+# 0B9C valid # TAMIL LETTER JA
+0B9D >FFFD #
+# 0B9E..0B9F valid # TAMIL LETTER NYA..TAMIL LETTER TTA
+0BA0..0BA2 >FFFD # ..
+# 0BA3..0BA4 valid # TAMIL LETTER NNA..TAMIL LETTER TA
+0BA5..0BA7 >FFFD # ..
+# 0BA8..0BAA valid # TAMIL LETTER NA..TAMIL LETTER PA
+0BAB..0BAD >FFFD # ..
+# 0BAE..0BB9 valid # TAMIL LETTER MA..TAMIL LETTER HA
+0BBA..0BBD >FFFD # ..
+# 0BBE..0BC2 valid # TAMIL VOWEL SIGN AA..TAMIL VOWEL SIGN UU
+0BC3..0BC5 >FFFD # ..
+# 0BC6..0BC8 valid # TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI
+0BC9 >FFFD #
+# 0BCA..0BCD valid # TAMIL VOWEL SIGN O..TAMIL SIGN VIRAMA
+0BCE..0BCF >FFFD # ..
+# 0BD0 valid # TAMIL OM
+0BD1..0BD6 >FFFD # ..
+# 0BD7 valid # TAMIL AU LENGTH MARK
+0BD8..0BE5 >FFFD # ..
+# 0BE6..0BFA valid # TAMIL DIGIT ZERO..TAMIL NUMBER SIGN
+0BFB..0C00 >FFFD # ..
+# 0C01..0C03 valid # TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA
+0C04 >FFFD #
+# 0C05..0C0C valid # TELUGU LETTER A..TELUGU LETTER VOCALIC L
+0C0D >FFFD #
+# 0C0E..0C10 valid # TELUGU LETTER E..TELUGU LETTER AI
+0C11 >FFFD #
+# 0C12..0C28 valid # TELUGU LETTER O..TELUGU LETTER NA
+0C29 >FFFD #
+# 0C2A..0C33 valid # TELUGU LETTER PA..TELUGU LETTER LLA
+0C34 >FFFD #
+# 0C35..0C39 valid # TELUGU LETTER VA..TELUGU LETTER HA
+0C3A..0C3C >FFFD # ..
+# 0C3D..0C44 valid # TELUGU SIGN AVAGRAHA..TELUGU VOWEL SIGN VOCALIC RR
+0C45 >FFFD #
+# 0C46..0C48 valid # TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI
+0C49 >FFFD #
+# 0C4A..0C4D valid # TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA
+0C4E..0C54 >FFFD # ..
+# 0C55..0C56 valid # TELUGU LENGTH MARK..TELUGU AI LENGTH MARK
+0C57 >FFFD #
+# 0C58..0C59 valid # TELUGU LETTER TSA..TELUGU LETTER DZA
+0C5A..0C5F >FFFD # ..
+# 0C60..0C63 valid # TELUGU LETTER VOCALIC RR..TELUGU VOWEL SIGN VOCALIC LL
+0C64..0C65 >FFFD # ..
+# 0C66..0C6F valid # TELUGU DIGIT ZERO..TELUGU DIGIT NINE
+0C70..0C77 >FFFD # ..
+# 0C78..0C7F valid # TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR..TELUGU SIGN TUUMU
+0C80..0C81 >FFFD # ..
+# 0C82..0C83 valid # KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA
+0C84 >FFFD #
+# 0C85..0C8C valid # KANNADA LETTER A..KANNADA LETTER VOCALIC L
+0C8D >FFFD #
+# 0C8E..0C90 valid # KANNADA LETTER E..KANNADA LETTER AI
+0C91 >FFFD #
+# 0C92..0CA8 valid # KANNADA LETTER O..KANNADA LETTER NA
+0CA9 >FFFD #
+# 0CAA..0CB3 valid # KANNADA LETTER PA..KANNADA LETTER LLA
+0CB4 >FFFD #
+# 0CB5..0CB9 valid # KANNADA LETTER VA..KANNADA LETTER HA
+0CBA..0CBB >FFFD # ..
+# 0CBC..0CC4 valid # KANNADA SIGN NUKTA..KANNADA VOWEL SIGN VOCALIC RR
+0CC5 >FFFD #
+# 0CC6..0CC8 valid # KANNADA VOWEL SIGN E..KANNADA VOWEL SIGN AI
+0CC9 >FFFD #
+# 0CCA..0CCD valid # KANNADA VOWEL SIGN O..KANNADA SIGN VIRAMA
+0CCE..0CD4 >FFFD # ..
+# 0CD5..0CD6 valid # KANNADA LENGTH MARK..KANNADA AI LENGTH MARK
+0CD7..0CDD >FFFD # ..
+# 0CDE valid # KANNADA LETTER FA
+0CDF >FFFD #
+# 0CE0..0CE3 valid # KANNADA LETTER VOCALIC RR..KANNADA VOWEL SIGN VOCALIC LL
+0CE4..0CE5 >FFFD # ..
+# 0CE6..0CEF valid # KANNADA DIGIT ZERO..KANNADA DIGIT NINE
+0CF0 >FFFD #
+# 0CF1..0CF2 valid # KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
+0CF3..0D01 >FFFD # ..
+# 0D02..0D03 valid # MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
+0D04 >FFFD #
+# 0D05..0D0C valid # MALAYALAM LETTER A..MALAYALAM LETTER VOCALIC L
+0D0D >FFFD #
+# 0D0E..0D10 valid # MALAYALAM LETTER E..MALAYALAM LETTER AI
+0D11 >FFFD #
+# 0D12..0D28 valid # MALAYALAM LETTER O..MALAYALAM LETTER NA
+0D29 >FFFD #
+# 0D2A..0D39 valid # MALAYALAM LETTER PA..MALAYALAM LETTER HA
+0D3A..0D3C >FFFD # ..
+# 0D3D..0D44 valid # MALAYALAM SIGN AVAGRAHA..MALAYALAM VOWEL SIGN VOCALIC RR
+0D45 >FFFD #
+# 0D46..0D48 valid # MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN AI
+0D49 >FFFD #
+# 0D4A..0D4D valid # MALAYALAM VOWEL SIGN O..MALAYALAM SIGN VIRAMA
+0D4E..0D56 >FFFD # ..
+# 0D57 valid # MALAYALAM AU LENGTH MARK
+0D58..0D5F >FFFD # ..
+# 0D60..0D63 valid # MALAYALAM LETTER VOCALIC RR..MALAYALAM VOWEL SIGN VOCALIC LL
+0D64..0D65 >FFFD # ..
+# 0D66..0D75 valid # MALAYALAM DIGIT ZERO..MALAYALAM FRACTION THREE QUARTERS
+0D76..0D78 >FFFD # ..
+# 0D79..0D7F valid # MALAYALAM DATE MARK..MALAYALAM LETTER CHILLU K
+0D80..0D81 >FFFD # ..
+# 0D82..0D83 valid # SINHALA SIGN ANUSVARAYA..SINHALA SIGN VISARGAYA
+0D84 >FFFD #
+# 0D85..0D96 valid # SINHALA LETTER AYANNA..SINHALA LETTER AUYANNA
+0D97..0D99 >FFFD # ..
+# 0D9A..0DB1 valid # SINHALA LETTER ALPAPRAANA KAYANNA..SINHALA LETTER DANTAJA NAYANNA
+0DB2 >FFFD #
+# 0DB3..0DBB valid # SINHALA LETTER SANYAKA DAYANNA..SINHALA LETTER RAYANNA
+0DBC >FFFD #
+# 0DBD valid # SINHALA LETTER DANTAJA LAYANNA
+0DBE..0DBF >FFFD # ..
+# 0DC0..0DC6 valid # SINHALA LETTER VAYANNA..SINHALA LETTER FAYANNA
+0DC7..0DC9 >FFFD # ..
+# 0DCA valid # SINHALA SIGN AL-LAKUNA
+0DCB..0DCE >FFFD # ..
+# 0DCF..0DD4 valid # SINHALA VOWEL SIGN AELA-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA
+0DD5 >FFFD #