ICU-8615 implement optional IDNA2008 CONTEXTO check in UTS46
X-SVN-Rev: 30268
This commit is contained in:
parent
3044b39615
commit
b6036a94f9
@ -1,6 +1,6 @@
|
||||
/*
|
||||
********************************************************************************
|
||||
* Copyright (C) 1996-2010, International Business Machines
|
||||
* Copyright (C) 1996-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
********************************************************************************
|
||||
*
|
||||
@ -23,6 +23,7 @@
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "uassert.h"
|
||||
#include "umutex.h"
|
||||
#include "cmemory.h"
|
||||
#include "ucln_cmn.h"
|
||||
@ -475,7 +476,7 @@ u_forDigit(int32_t digit, int8_t radix) {
|
||||
}
|
||||
}
|
||||
|
||||
/* miscellaneous, and support for uprops.c ---------------------------------- */
|
||||
/* miscellaneous, and support for uprops.cpp -------------------------------- */
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
u_getUnicodeVersion(UVersionInfo versionArray) {
|
||||
@ -485,19 +486,19 @@ u_getUnicodeVersion(UVersionInfo versionArray) {
|
||||
}
|
||||
|
||||
U_CFUNC uint32_t
|
||||
u_getUnicodeProperties(UChar32 c, int32_t column) {
|
||||
uint16_t vecIndex;
|
||||
u_getMainProperties(UChar32 c) {
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return props;
|
||||
}
|
||||
|
||||
if(column==-1) {
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return props;
|
||||
} else if(
|
||||
column<0 || column>=propsVectorsColumns
|
||||
) {
|
||||
U_CFUNC uint32_t
|
||||
u_getUnicodeProperties(UChar32 c, int32_t column) {
|
||||
U_ASSERT(column>=0);
|
||||
if(column>=propsVectorsColumns) {
|
||||
return 0;
|
||||
} else {
|
||||
vecIndex=UTRIE2_GET16(&propsVectorsTrie, c);
|
||||
uint16_t vecIndex=UTRIE2_GET16(&propsVectorsTrie, c);
|
||||
return propsVectors[vecIndex+column];
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2003-2010, International Business Machines
|
||||
* Copyright (C) 2003-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
@ -42,12 +42,14 @@
|
||||
enum {
|
||||
/**
|
||||
* Default options value: None of the other options are set.
|
||||
* For use in static worker and factory methods.
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
UIDNA_DEFAULT=0,
|
||||
/**
|
||||
* Option to allow unassigned code points in domain names and labels.
|
||||
* This option is ignored by the UTS46 implementation.
|
||||
* For use in static worker and factory methods.
|
||||
* <p>This option is ignored by the UTS46 implementation.
|
||||
* (UTS #46 disallows unassigned code points.)
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
@ -56,39 +58,54 @@ enum {
|
||||
* Option to check whether the input conforms to the STD3 ASCII rules,
|
||||
* for example the restriction of labels to LDH characters
|
||||
* (ASCII Letters, Digits and Hyphen-Minus).
|
||||
* For use in static worker and factory methods.
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
UIDNA_USE_STD3_RULES=2,
|
||||
/**
|
||||
* IDNA option to check whether the input conforms to the BiDi rules.
|
||||
* This option is ignored by the IDNA2003 implementation.
|
||||
* For use in static worker and factory methods.
|
||||
* <p>This option is ignored by the IDNA2003 implementation.
|
||||
* (IDNA2003 always performs a BiDi check.)
|
||||
* @draft ICU 4.6
|
||||
*/
|
||||
UIDNA_CHECK_BIDI=4,
|
||||
/**
|
||||
* IDNA option to check whether the input conforms to the CONTEXTJ rules.
|
||||
* This option is ignored by the IDNA2003 implementation.
|
||||
* For use in static worker and factory methods.
|
||||
* <p>This option is ignored by the IDNA2003 implementation.
|
||||
* (The CONTEXTJ check is new in IDNA2008.)
|
||||
* @draft ICU 4.6
|
||||
*/
|
||||
UIDNA_CHECK_CONTEXTJ=8,
|
||||
/**
|
||||
* IDNA option for nontransitional processing in ToASCII().
|
||||
* By default, ToASCII() uses transitional processing.
|
||||
* This option is ignored by the IDNA2003 implementation.
|
||||
* For use in static worker and factory methods.
|
||||
* <p>By default, ToASCII() uses transitional processing.
|
||||
* <p>This option is ignored by the IDNA2003 implementation.
|
||||
* (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.)
|
||||
* @draft ICU 4.6
|
||||
*/
|
||||
UIDNA_NONTRANSITIONAL_TO_ASCII=0x10,
|
||||
/**
|
||||
* IDNA option for nontransitional processing in ToUnicode().
|
||||
* By default, ToUnicode() uses transitional processing.
|
||||
* This option is ignored by the IDNA2003 implementation.
|
||||
* For use in static worker and factory methods.
|
||||
* <p>By default, ToUnicode() uses transitional processing.
|
||||
* <p>This option is ignored by the IDNA2003 implementation.
|
||||
* (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.)
|
||||
* @draft ICU 4.6
|
||||
*/
|
||||
UIDNA_NONTRANSITIONAL_TO_UNICODE=0x20
|
||||
UIDNA_NONTRANSITIONAL_TO_UNICODE=0x20,
|
||||
/**
|
||||
* IDNA option to check whether the input conforms to the CONTEXTO rules.
|
||||
* For use in static worker and factory methods.
|
||||
* <p>This option is ignored by the IDNA2003 implementation.
|
||||
* (The CONTEXTO check is new in IDNA2008.)
|
||||
* <p>This is for use by registries for IDNA2008 conformance.
|
||||
* UTS #46 does not require the CONTEXTO check.
|
||||
* @draft ICU 49
|
||||
*/
|
||||
UIDNA_CHECK_CONTEXTO=0x40
|
||||
};
|
||||
|
||||
/**
|
||||
@ -471,7 +488,20 @@ enum {
|
||||
* A label does not meet the IDNA CONTEXTJ requirements.
|
||||
* @draft ICU 4.6
|
||||
*/
|
||||
UIDNA_ERROR_CONTEXTJ=0x1000
|
||||
UIDNA_ERROR_CONTEXTJ=0x1000,
|
||||
/**
|
||||
* A label does not meet the IDNA CONTEXTO requirements for punctuation characters.
|
||||
* Some punctuation characters "Would otherwise have been DISALLOWED"
|
||||
* but are allowed in certain contexts. (RFC 5892)
|
||||
* @draft ICU 49
|
||||
*/
|
||||
UIDNA_ERROR_CONTEXTO_PUNCTUATION=0x2000,
|
||||
/**
|
||||
* A label does not meet the IDNA CONTEXTO requirements for digits.
|
||||
* Arabic-Indic Digits (U+066x) must not be mixed with Extended Arabic-Indic Digits (U+06Fx).
|
||||
* @draft ICU 49
|
||||
*/
|
||||
UIDNA_ERROR_CONTEXTO_DIGITS=0x4000
|
||||
};
|
||||
|
||||
/* IDNA2003 API ------------------------------------------------------------- */
|
||||
|
@ -214,7 +214,7 @@ static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={
|
||||
* Must be in order of corresponding UProperty,
|
||||
* and there must be exactly one entry per binary UProperty.
|
||||
*
|
||||
* Properties with mask==0 and contains==NULL are handled in code.
|
||||
* Properties with mask==0 are handled in code.
|
||||
* For them, column is the UPropertySource value.
|
||||
*/
|
||||
{ 1, U_MASK(UPROPS_ALPHABETIC), defaultContains },
|
||||
@ -345,7 +345,7 @@ static int32_t getJoiningType(const IntProperty &/*prop*/, UChar32 c, UProperty
|
||||
}
|
||||
|
||||
static int32_t getNumericType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
|
||||
int32_t ntv=(int32_t)GET_NUMERIC_TYPE_VALUE(u_getUnicodeProperties(c, -1));
|
||||
int32_t ntv=(int32_t)GET_NUMERIC_TYPE_VALUE(u_getMainProperties(c));
|
||||
return UPROPS_NTV_GET_TYPE(ntv);
|
||||
}
|
||||
|
||||
@ -421,7 +421,7 @@ static const IntProperty intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]={
|
||||
* Must be in order of corresponding UProperty,
|
||||
* and there must be exactly one entry per int UProperty.
|
||||
*
|
||||
* Properties with mask==0 and getValue==NULL are handled in code.
|
||||
* Properties with mask==0 are handled in code.
|
||||
* For them, column is the UPropertySource value.
|
||||
*/
|
||||
{ UPROPS_SRC_BIDI, 0, 0, getBiDiClass, biDiGetMaxValue },
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002-2010, International Business Machines
|
||||
* Copyright (C) 2002-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
@ -192,10 +192,16 @@ enum {
|
||||
|
||||
#define UPROPS_DT_MASK 0x0000001f
|
||||
|
||||
/**
|
||||
* Gets the main properties value for a code point.
|
||||
* Implemented in uchar.c for uprops.cpp.
|
||||
*/
|
||||
U_CFUNC uint32_t
|
||||
u_getMainProperties(UChar32 c);
|
||||
|
||||
/**
|
||||
* Get a properties vector word for a code point.
|
||||
* Implemented in uchar.c for uprops.c.
|
||||
* column==-1 gets the 32-bit main properties word instead.
|
||||
* Implemented in uchar.c for uprops.cpp.
|
||||
* @return 0 if no data or illegal argument
|
||||
*/
|
||||
U_CFUNC uint32_t
|
||||
|
@ -18,6 +18,7 @@
|
||||
|
||||
#include "unicode/idna.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
@ -188,6 +189,9 @@ private:
|
||||
UBool
|
||||
isLabelOkContextJ(const UChar *label, int32_t labelLength) const;
|
||||
|
||||
void
|
||||
checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const;
|
||||
|
||||
const Normalizer2 &uts46Norm2; // uts46.nrm
|
||||
uint32_t options;
|
||||
};
|
||||
@ -822,6 +826,9 @@ UTS46::processLabel(UnicodeString &dest,
|
||||
) {
|
||||
info.labelErrors|=UIDNA_ERROR_CONTEXTJ;
|
||||
}
|
||||
if((options&UIDNA_CHECK_CONTEXTO)!=0 && oredChars>=0xb7) {
|
||||
checkLabelContextO(label, labelLength, info);
|
||||
}
|
||||
if(toASCII) {
|
||||
if(wasPunycode) {
|
||||
// Leave a Punycode label unchanged if it has no severe errors.
|
||||
@ -1171,6 +1178,109 @@ UTS46::isLabelOkContextJ(const UChar *label, int32_t labelLength) const {
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/**
 * Checks one label against the CONTEXTO rules quoted below from RFC 5892 Appendix A
 * and ORs the resulting error bits into info.labelErrors.
 *
 * Sets UIDNA_ERROR_CONTEXTO_PUNCTUATION when U+00B7, U+0375, U+05F3, U+05F4 or U+30FB
 * appears outside its required context, and UIDNA_ERROR_CONTEXTO_DIGITS when
 * U+0660..U+0669 and U+06F0..U+06F9 both occur in the label.
 * The whole label is always scanned; an error does not stop the loop.
 *
 * @param label       the label text as UTF-16 code units
 * @param labelLength number of code units in label
 * @param info        receives the error bits in info.labelErrors (bits are only added)
 */
void
|
||||
UTS46::checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const {
|
||||
int32_t labelEnd=labelLength-1; // inclusive
|
||||
int32_t arabicDigits=0; // -1 for 066x, +1 for 06Fx
|
||||
for(int32_t i=0; i<=labelEnd; ++i) {
|
||||
UChar32 c=label[i];
|
||||
// Every code point tested below is >= U+00B7, so anything smaller needs no check.
if(c<0xb7) {
|
||||
// ASCII fastpath
|
||||
} else if(c<=0x6f9) {
|
||||
if(c==0xb7) {
|
||||
// Appendix A.3. MIDDLE DOT (U+00B7)
|
||||
// Rule Set:
|
||||
// False;
|
||||
// If Before(cp) .eq. U+006C And
|
||||
// After(cp) .eq. U+006C Then True;
|
||||
// The index guards keep label[i-1] and label[i+1] inside the label.
if(!(0<i && label[i-1]==0x6c &&
|
||||
i<labelEnd && label[i+1]==0x6c)) {
|
||||
info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
|
||||
}
|
||||
} else if(c==0x375) {
|
||||
// Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
|
||||
// Rule Set:
|
||||
// False;
|
||||
// If Script(After(cp)) .eq. Greek Then True;
|
||||
// script stays USCRIPT_INVALID_CODE when cp is the last code unit,
// which makes the comparison below report an error.
UScriptCode script=USCRIPT_INVALID_CODE;
|
||||
if(i<labelEnd) {
|
||||
// errorCode is required by uscript_getScript() but is not inspected afterwards.
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
int32_t j=i+1;
|
||||
// Reads the code point that follows cp into c (U16_NEXT handles surrogate pairs);
// c is not used again in this iteration.
U16_NEXT(label, j, labelLength, c);
|
||||
script=uscript_getScript(c, &errorCode);
|
||||
}
|
||||
if(script!=USCRIPT_GREEK) {
|
||||
info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
|
||||
}
|
||||
} else if(c==0x5f3 || c==0x5f4) {
|
||||
// Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
|
||||
// Rule Set:
|
||||
// False;
|
||||
// If Script(Before(cp)) .eq. Hebrew Then True;
|
||||
//
|
||||
// Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
|
||||
// Rule Set:
|
||||
// False;
|
||||
// If Script(Before(cp)) .eq. Hebrew Then True;
|
||||
// script stays USCRIPT_INVALID_CODE when cp is the first code unit,
// which makes the comparison below report an error.
UScriptCode script=USCRIPT_INVALID_CODE;
|
||||
if(0<i) {
|
||||
// errorCode is required by uscript_getScript() but is not inspected afterwards.
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
int32_t j=i;
|
||||
// Reads the code point that precedes cp into c (U16_PREV handles surrogate pairs);
// c is not used again in this iteration.
U16_PREV(label, 0, j, c);
|
||||
script=uscript_getScript(c, &errorCode);
|
||||
}
|
||||
if(script!=USCRIPT_HEBREW) {
|
||||
info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
|
||||
}
|
||||
} else if(0x660<=c /* && c<=0x6f9 */) {
|
||||
// Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
|
||||
// Rule Set:
|
||||
// True;
|
||||
// For All Characters:
|
||||
// If cp .in. 06F0..06F9 Then False;
|
||||
// End For;
|
||||
//
|
||||
// Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
|
||||
// Rule Set:
|
||||
// True;
|
||||
// For All Characters:
|
||||
// If cp .in. 0660..0669 Then False;
|
||||
// End For;
|
||||
// arabicDigits remembers which digit range was seen most recently; seeing the
// other range afterwards is the mix that the rules forbid.
// Code points U+066A..U+06EF match neither test and leave arabicDigits unchanged.
if(c<=0x669) {
|
||||
if(arabicDigits>0) {
|
||||
info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS;
|
||||
}
|
||||
arabicDigits=-1;
|
||||
} else if(0x6f0<=c) {
|
||||
if(arabicDigits<0) {
|
||||
info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS;
|
||||
}
|
||||
arabicDigits=1;
|
||||
}
|
||||
}
|
||||
} else if(c==0x30fb) {
|
||||
// Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
|
||||
// Rule Set:
|
||||
// False;
|
||||
// For All Characters:
|
||||
// If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
|
||||
// End For;
|
||||
// errorCode is required by uscript_getScript() but is not inspected afterwards.
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
// Scans the label from its start, independent of i. The error is set only when
// the end is reached without finding a Hiragana, Katakana or Han code point.
for(int j=0;;) {
|
||||
if(j>labelEnd) {
|
||||
info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
|
||||
break;
|
||||
}
|
||||
U16_NEXT(label, j, labelLength, c);
|
||||
UScriptCode script=uscript_getScript(c, &errorCode);
|
||||
if(script==USCRIPT_HIRAGANA || script==USCRIPT_KATAKANA || script==USCRIPT_HAN) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
// C API ------------------------------------------------------------------- ***
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Copyright (C) 2010-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: uts46test.cpp
|
||||
@ -55,11 +55,12 @@ void UTS46Test::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
|
||||
logln("TestSuite UTS46Test: ");
|
||||
if(trans==NULL) {
|
||||
IcuTestErrorCode errorCode(*this, "init/createUTS46Instance()");
|
||||
trans=IDNA::createUTS46Instance(
|
||||
UIDNA_USE_STD3_RULES|UIDNA_CHECK_BIDI|UIDNA_CHECK_CONTEXTJ,
|
||||
errorCode);
|
||||
uint32_t commonOptions=
|
||||
UIDNA_USE_STD3_RULES|UIDNA_CHECK_BIDI|
|
||||
UIDNA_CHECK_CONTEXTJ|UIDNA_CHECK_CONTEXTO;
|
||||
trans=IDNA::createUTS46Instance(commonOptions, errorCode);
|
||||
nontrans=IDNA::createUTS46Instance(
|
||||
UIDNA_USE_STD3_RULES|UIDNA_CHECK_BIDI|UIDNA_CHECK_CONTEXTJ|
|
||||
commonOptions|
|
||||
UIDNA_NONTRANSITIONAL_TO_ASCII|UIDNA_NONTRANSITIONAL_TO_UNICODE,
|
||||
errorCode);
|
||||
if(errorCode.logDataIfFailureAndReset("createUTS46Instance()")) {
|
||||
@ -534,6 +535,29 @@ static const TestCase testCases[]={
|
||||
"\\u06EF\\u200C\\u06EF", UIDNA_ERROR_CONTEXTJ },
|
||||
{ "\\u0644\\u200C", "N", // D ZWNJ
|
||||
"\\u0644\\u200C", UIDNA_ERROR_BIDI|UIDNA_ERROR_CONTEXTJ },
|
||||
{ "\\u0660\\u0661", "B", // Arabic-Indic Digits alone
|
||||
"\\u0660\\u0661", UIDNA_ERROR_BIDI },
|
||||
{ "\\u06F0\\u06F1", "B", // Extended Arabic-Indic Digits alone
|
||||
"\\u06F0\\u06F1", 0 },
|
||||
{ "\\u0660\\u06F1", "B", // Mixed Arabic-Indic Digits
|
||||
"\\u0660\\u06F1", UIDNA_ERROR_CONTEXTO_DIGITS|UIDNA_ERROR_BIDI },
|
||||
// All of the CONTEXTO "Would otherwise have been DISALLOWED" characters
|
||||
// in their correct contexts,
|
||||
// then each in incorrect context.
|
||||
{ "l\\u00B7l\\u4E00\\u0375\\u03B1\\u05D0\\u05F3\\u05F4\\u30FB", "B",
|
||||
"l\\u00B7l\\u4E00\\u0375\\u03B1\\u05D0\\u05F3\\u05F4\\u30FB", UIDNA_ERROR_BIDI },
|
||||
{ "l\\u00B7", "B",
|
||||
"l\\u00B7", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
|
||||
{ "\\u00B7l", "B",
|
||||
"\\u00B7l", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
|
||||
{ "\\u0375", "B",
|
||||
"\\u0375", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
|
||||
{ "\\u03B1\\u05F3", "B",
|
||||
"\\u03B1\\u05F3", UIDNA_ERROR_CONTEXTO_PUNCTUATION|UIDNA_ERROR_BIDI },
|
||||
{ "\\u05F4", "B",
|
||||
"\\u05F4", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
|
||||
{ "l\\u30FB", "B",
|
||||
"l\\u30FB", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
|
||||
// Ticket #8137: UTS #46 toUnicode() fails with non-ASCII labels that turn
|
||||
// into 15 characters (UChars).
|
||||
// The bug was in u_strFromPunycode() which did not write the last character
|
||||
|
Loading…
Reference in New Issue
Block a user