ICU-2466 add IMAP-mailbox-name converter

X-SVN-Rev: 10187
This commit is contained in:
Markus Scherer 2002-11-07 21:02:24 +00:00
parent 54d2cd87e5
commit 442a78aeec
8 changed files with 914 additions and 16 deletions

View File

@ -78,7 +78,7 @@ converterData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES]={
#endif
&_ASCIIData,
&_UTF7Data, &_Bocu1Data, &_UTF16Data, &_UTF32Data, &_CESU8Data
&_UTF7Data, &_Bocu1Data, &_UTF16Data, &_UTF32Data, &_CESU8Data, &_IMAPData
};
/* Please keep this in binary sorted order for getAlgorithmicTypeFromName.
@ -93,6 +93,7 @@ static struct {
{ "cesu8", UCNV_CESU8 },
#if !UCONFIG_NO_LEGACY_CONVERSION
{ "hz",UCNV_HZ },
{ "imapmailboxname", UCNV_IMAP_MAILBOX },
{ "iscii", UCNV_ISCII },
{ "iso2022", UCNV_ISO_2022 },
#endif

View File

@ -158,7 +158,7 @@ extern const UConverterSharedData
_LMBCSData1,_LMBCSData2, _LMBCSData3, _LMBCSData4, _LMBCSData5, _LMBCSData6,
_LMBCSData8,_LMBCSData11,_LMBCSData16,_LMBCSData17,_LMBCSData18,_LMBCSData19,
_HZData,_ISCIIData, _SCSUData, _ASCIIData,
_UTF7Data, _Bocu1Data, _UTF16Data, _UTF32Data, _CESU8Data;
_UTF7Data, _Bocu1Data, _UTF16Data, _UTF32Data, _CESU8Data, _IMAPData;
U_CDECL_END

View File

@ -498,7 +498,7 @@ callback:
static UChar32
_UTF7GetNextUChar(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
return ucnv_getNextUCharFromToUImpl(pArgs, _UTF7ToUnicodeWithOffsets, TRUE, pErrorCode);
return ucnv_getNextUCharFromToUImpl(pArgs, pArgs->converter->sharedData->impl->toUnicode, TRUE, pErrorCode);
}
static void
@ -618,7 +618,7 @@ unicodeMode:
if(target<targetLimit) {
*target++=MINUS;
if(offsets!=NULL) {
*offsets++=sourceIndex;
*offsets++=sourceIndex-1;
}
} else {
cnv->charErrorBuffer[0]=MINUS;
@ -744,8 +744,7 @@ unicodeMode:
*offsets++=sourceIndex-1;
}
} else {
cnv->charErrorBuffer[0]=toBase64[bits];
cnv->charErrorBufferLength=1;
cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
}
@ -814,3 +813,738 @@ const UConverterSharedData _UTF7Data={
NULL, NULL, &_UTF7StaticData, FALSE, &_UTF7Impl,
0
};
/* IMAP mailbox name encoding ----------------------------------------------- */
/*
* RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
* http://www.ietf.org/rfc/rfc2060.txt
*
* 5.1.3. Mailbox International Naming Convention
*
* By convention, international mailbox names are specified using a
* modified version of the UTF-7 encoding described in [UTF-7]. The
* purpose of these modifications is to correct the following problems
* with UTF-7:
*
* 1) UTF-7 uses the "+" character for shifting; this conflicts with
* the common use of "+" in mailbox names, in particular USENET
* newsgroup names.
*
* 2) UTF-7's encoding is BASE64 which uses the "/" character; this
* conflicts with the use of "/" as a popular hierarchy delimiter.
*
* 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
* the use of "\" as a popular hierarchy delimiter.
*
* 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
* the use of "~" in some servers as a home directory indicator.
*
* 5) UTF-7 permits multiple alternate forms to represent the same
* string; in particular, printable US-ASCII chararacters can be
* represented in encoded form.
*
* In modified UTF-7, printable US-ASCII characters except for "&"
* represent themselves; that is, characters with octet values 0x20-0x25
* and 0x27-0x7e. The character "&" (0x26) is represented by the two-
* octet sequence "&-".
*
* All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
* Unicode 16-bit octets) are represented in modified BASE64, with a
* further modification from [UTF-7] that "," is used instead of "/".
* Modified BASE64 MUST NOT be used to represent any printing US-ASCII
* character which can represent itself.
*
* "&" is used to shift to modified BASE64 and "-" to shift back to US-
* ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that
* is, a name that ends with a Unicode 16-bit octet MUST end with a "-
* ").
*
* For example, here is a mailbox name which mixes English, Japanese,
* and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
*/
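/*
* Background only: US-ASCII character classes as defined for UTF-7 (RFC 2152).
* The IMAP macros further below do not use these sets; they accept every
* byte in 0x20..0x7e and encode all of them directly except for '&'.
*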
* Set D (directly encoded characters) consists of the following
* characters: the upper and lower case letters A through Z
* and a through z, the 10 digits 0-9, and the following nine special
* characters (note that "+" and "=" are omitted):
* '(),-./:?
*
* Set O (optional direct characters) consists of the following
* characters (note that "\" and "~" are omitted):
* !"#$%&*;<=>@[]^_`{|}
*
* According to the rules in RFC 2152, the byte values for the following
* US-ASCII characters are not used in UTF-7 and are therefore illegal:
* - all C0 control codes except for CR LF TAB
* - BACKSLASH
* - TILDE
* - DEL
* - all codes beyond US-ASCII, i.e. all >127
*/
/*
 * Character constants and classification macros for the IMAP mailbox name
 * variant of UTF-7.
 * All function-like macros here evaluate their argument more than once:
 * do not pass expressions with side effects.
 */

/* uses '&' not '+' to start a base64 sequence */
#define AMPERSAND 0x26
/* ',' replaces '/' as the last symbol of the modified base64 alphabet */
#define COMMA 0x2c
#define SLASH 0x2f

/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/*
 * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
 * (the argument is parenthesized so that operators with lower precedence
 * than != inside the argument cannot change the meaning of the test)
 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

/* map a 6-bit value to its base64 byte; value 63 yields ',' */
#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
/* map a byte to its 6-bit value; ',' is 63 and '/' is rejected with -1 */
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
/*
* converter status values:
*
* toUnicodeStatus:
* 24 inDirectMode (boolean)
* 23..16 base64Counter (-1..7)
* 15..0 bits (up to 14 bits incoming base64)
*
* fromUnicodeStatus:
* 24 inDirectMode (boolean)
* 23..16 base64Counter (0..2)
* 7..0 bits (6 bits outgoing base64)
*
* ignore bits 31..25
*/
/*
 * Convert IMAP mailbox name bytes (modified UTF-7, see RFC 2060 5.1.3 quoted
 * above) to UTF-16 code units, optionally recording source offsets.
 *
 * pArgs      supplies converter, source/sourceLimit, target/targetLimit,
 *            offsets (may be NULL) and the flush flag; source, target and
 *            offsets are written back before returning.
 * pErrorCode receives U_BUFFER_OVERFLOW_ERROR when the target fills up,
 *            U_TRUNCATED_CHAR_FOUND when flushing while still inside a
 *            base64 sequence, or whatever the to-Unicode callback leaves
 *            after it was called with U_ILLEGAL_CHAR_FOUND.
 *
 * The function is a two-state machine (Direct Mode / Unicode Mode) whose
 * state is kept in cnv->toUnicodeStatus and cnv->toUBytes/toULength between
 * calls. Control moves between the states and the shared "callback" tail
 * with gotos, so the statement order matters for offsets and state.
 */
static void
_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
UConverter *cnv;
const uint8_t *source, *sourceLimit;
UChar *target;
const UChar *targetLimit;
int32_t *offsets;
uint8_t *bytes;
uint8_t byteIndex;
int32_t length, targetCapacity;
/* UTF-7 state */
uint16_t bits;
int8_t base64Counter;
UBool inDirectMode;
int8_t base64Value;
int32_t sourceIndex, nextSourceIndex;
UChar c;
uint8_t b;
/* set up the local pointers */
cnv=pArgs->converter;
source=(const uint8_t *)pArgs->source;
sourceLimit=(const uint8_t *)pArgs->sourceLimit;
target=pArgs->target;
targetLimit=pArgs->targetLimit;
offsets=pArgs->offsets;
/* get the state machine state (layout: bit 24, bits 23..16, bits 15..0) */
{
uint32_t status=cnv->toUnicodeStatus;
inDirectMode=(UBool)((status>>24)&1);
base64Counter=(int8_t)(status>>16);
bits=(uint16_t)status;
}
/* bytes of the current, not yet completed sequence are kept in the converter */
bytes=cnv->toUBytes;
byteIndex=cnv->toULength;
/* sourceIndex=-1 if the current character began in the previous buffer */
sourceIndex=byteIndex==0 ? 0 : -1;
nextSourceIndex=0;
loop:
if(inDirectMode) {
directMode:
/*
* In Direct Mode, US-ASCII characters are encoded directly, i.e.,
* with their US-ASCII byte values.
* An ampersand starts Unicode (or "escape") Mode.
*
* In Direct Mode, only the sourceIndex is used.
*/
byteIndex=0;
/* process at most as many bytes as fit into the target: one UChar per byte */
length=sourceLimit-source;
targetCapacity=targetLimit-target;
if(length>targetCapacity) {
length=targetCapacity;
}
while(length>0) {
b=*source++;
if(!isLegalIMAP(b)) {
/* illegal: byte outside 0x20..0x7e */
bytes[0]=b;
byteIndex=1;
nextSourceIndex=sourceIndex+1;
goto callback;
} else if(b!=AMPERSAND) {
/* write directly encoded character */
*target++=b;
if(offsets!=NULL) {
*offsets++=sourceIndex++;
}
} else /* AMPERSAND */ {
/* switch to Unicode mode */
nextSourceIndex=++sourceIndex;
inDirectMode=FALSE;
byteIndex=0;
bits=0;
base64Counter=-1;
goto unicodeMode;
}
--length;
}
if(source<sourceLimit && target>=targetLimit) {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
} else {
unicodeMode:
/*
* In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
* The base64 sequence ends with any character that is not in the base64 alphabet.
* A terminating minus sign is consumed.
* US-ASCII must not be base64-ed.
*
* In Unicode Mode, the sourceIndex has the index to the start of the current
* base64 bytes, while nextSourceIndex is precisely parallel to source,
* keeping the index to the following byte.
* Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
*/
while(source<sourceLimit) {
if(target<targetLimit) {
bytes[byteIndex++]=b=*source++;
++nextSourceIndex;
if(b>0x7e) {
/* illegal - test other illegal US-ASCII values by base64Value==-3 */
inDirectMode=TRUE;
goto callback;
} else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
/*
* collect base64 bytes into UChars
* base64Counter counts the 6-bit units within a group of 8 (=48 bits =3 UChars);
* a UChar is completed at counters 2, 5 and 7
*/
switch(base64Counter) {
case -1: /* -1 is immediately after the & */
case 0:
bits=base64Value;
base64Counter=1;
break;
case 1:
case 3:
case 4:
case 6:
bits=(uint16_t)((bits<<6)|base64Value);
++base64Counter;
break;
case 2:
/* 12 collected bits plus the top 4 bits of this unit; 2 bits carry over */
c=(UChar)((bits<<4)|(base64Value>>2));
if(isLegalIMAP(c)) {
/* illegal: 0x20..0x7e must not be base64-encoded */
inDirectMode=TRUE;
goto callback;
}
*target++=c;
if(offsets!=NULL) {
*offsets++=sourceIndex;
sourceIndex=nextSourceIndex-1;
}
bytes[0]=b; /* keep this byte in case an error occurs */
byteIndex=1;
bits=(uint16_t)(base64Value&3);
base64Counter=3;
break;
case 5:
/* 14 collected bits plus the top 2 bits of this unit; 4 bits carry over */
c=(UChar)((bits<<2)|(base64Value>>4));
if(isLegalIMAP(c)) {
/* illegal: 0x20..0x7e must not be base64-encoded */
inDirectMode=TRUE;
goto callback;
}
*target++=c;
if(offsets!=NULL) {
*offsets++=sourceIndex;
sourceIndex=nextSourceIndex-1;
}
bytes[0]=b; /* keep this byte in case an error occurs */
byteIndex=1;
bits=(uint16_t)(base64Value&15);
base64Counter=6;
break;
case 7:
/* 10 collected bits plus all 6 bits of this unit; nothing carries over */
c=(UChar)((bits<<6)|base64Value);
if(isLegalIMAP(c)) {
/* illegal: 0x20..0x7e must not be base64-encoded */
inDirectMode=TRUE;
goto callback;
}
*target++=c;
if(offsets!=NULL) {
*offsets++=sourceIndex;
sourceIndex=nextSourceIndex;
}
byteIndex=0;
bits=0;
base64Counter=0;
break;
default:
/* will never occur */
break;
}
} else if(base64Value==-2) {
/* minus sign terminates the base64 sequence */
inDirectMode=TRUE;
if(base64Counter==-1) {
/* &- i.e. a minus immediately following an ampersand */
*target++=AMPERSAND;
if(offsets!=NULL) {
*offsets++=sourceIndex-1;
}
} else {
/* absorb the minus and leave the Unicode Mode */
if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
/* bits are illegally left over, a UChar is incomplete */
/* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
goto callback;
}
}
sourceIndex=nextSourceIndex;
goto directMode;
} else {
if(base64Counter==-1) {
/* illegal: & immediately followed by something other than base64 or minus sign */
/* include the ampersand in the reported sequence */
--sourceIndex;
bytes[0]=AMPERSAND;
bytes[1]=b;
byteIndex=2;
}
/* base64Value==-1 for characters that are illegal only in Unicode mode */
/* base64Value==-3 for illegal characters */
/* illegal */
inDirectMode=TRUE;
goto callback;
}
} else {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
break;
}
}
}
endloop:
if(pArgs->flush && source>=sourceLimit) {
/* reset the state for the next conversion */
if(!inDirectMode && U_SUCCESS(*pErrorCode)) {
/* a character byte sequence remains incomplete - IMAP must end in ASCII/direct mode */
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
}
cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
cnv->toULength=0;
} else {
/* set the converter state back into UConverter */
cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
cnv->toULength=byteIndex;
}
finish:
/* write back the updated pointers */
pArgs->source=(const char *)source;
pArgs->target=target;
pArgs->offsets=offsets;
return;
callback:
/* call the callback function with all the preparations and post-processing */
/* update the arguments structure */
pArgs->source=(const char *)source;
pArgs->target=target;
pArgs->offsets=offsets;
/* copy the current bytes to invalidCharBuffer */
for(b=0; b<(uint8_t)byteIndex; ++b) {
cnv->invalidCharBuffer[b]=(char)bytes[b];
}
cnv->invalidCharLength=byteIndex;
/* set the converter state in UConverter to deal with the next character */
/* (only the mode flag survives: base64Counter and bits are cleared) */
cnv->toUnicodeStatus=(uint32_t)inDirectMode<<24;
cnv->toULength=0;
/* call the callback function */
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, cnv->invalidCharLength, UCNV_ILLEGAL, pErrorCode);
/* get the converter state from UConverter */
{
uint32_t status=cnv->toUnicodeStatus;
inDirectMode=(UBool)((status>>24)&1);
base64Counter=(int8_t)(status>>16);
bits=(uint16_t)status;
}
byteIndex=cnv->toULength;
/* update target and deal with offsets if necessary */
offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
target=pArgs->target;
/* update the source pointer and index */
sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
source=(const uint8_t *)pArgs->source;
/*
* If the callback overflowed the target, then we need to
* stop here with an overflow indication.
*/
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
goto endloop;
} else if(cnv->UCharErrorBufferLength>0) {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
goto endloop;
} else if(U_FAILURE(*pErrorCode)) {
/* break on error */
cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
cnv->toULength=0;
goto finish;
} else {
/* the callback cleared the error: continue converting in the restored mode */
goto loop;
}
}
/*
 * Convert UTF-16 code units to IMAP mailbox name bytes (modified UTF-7, see
 * RFC 2060 5.1.3 quoted above), optionally recording source offsets.
 *
 * pArgs      supplies converter, source/sourceLimit, target/targetLimit,
 *            offsets (may be NULL) and the flush flag; source, target and
 *            offsets are written back before returning.
 * pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target fills up.
 *            Output bytes that no longer fit are stored in
 *            cnv->charErrorBuffer.
 *
 * Direct Mode writes 0x20..0x7e as is, except that '&' becomes "&-".
 * Any other code unit opens a base64 sequence with '&' (Unicode Mode).
 * Unicode Mode ends with '-' as soon as a 0x20..0x7e code unit follows,
 * or when flushing at the end of the input.
 * This function does not call a from-Unicode callback: every UTF-16 code
 * unit is encodable.
 */
static void
_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
UConverter *cnv;
const UChar *source, *sourceLimit;
uint8_t *target, *targetLimit;
int32_t *offsets;
int32_t length, targetCapacity, sourceIndex;
UChar c;
uint8_t b;
/* UTF-7 state */
uint8_t bits;
int8_t base64Counter;
UBool inDirectMode;
/* get the converter */
cnv=pArgs->converter;
/* set up the local pointers */
source=pArgs->source;
sourceLimit=pArgs->sourceLimit;
target=(uint8_t *)pArgs->target;
targetLimit=(uint8_t *)pArgs->targetLimit;
offsets=pArgs->offsets;
/* get the state machine state (layout: bit 24, bits 23..16, bits 7..0) */
{
uint32_t status=cnv->fromUnicodeStatus;
inDirectMode=(UBool)((status>>24)&1);
base64Counter=(int8_t)(status>>16);
bits=(uint8_t)status;
}
/* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
sourceIndex=0;
if(inDirectMode) {
directMode:
/* process at most as many code units as there is room for single output bytes */
length=sourceLimit-source;
targetCapacity=targetLimit-target;
if(length>targetCapacity) {
length=targetCapacity;
}
while(length>0) {
c=*source++;
/* encode 0x20..0x7e except '&' directly */
if(inSetDIMAP(c)) {
/* encode directly */
*target++=(uint8_t)c;
if(offsets!=NULL) {
*offsets++=sourceIndex++;
}
} else if(c==AMPERSAND) {
/* output &- for & */
*target++=AMPERSAND;
if(target<targetLimit) {
*target++=MINUS;
if(offsets!=NULL) {
*offsets++=sourceIndex;
*offsets++=sourceIndex++;
}
/* realign length and targetCapacity */
goto directMode;
} else {
/* no room for the minus: park it in the overflow buffer */
if(offsets!=NULL) {
*offsets++=sourceIndex++;
}
cnv->charErrorBuffer[0]=MINUS;
cnv->charErrorBufferLength=1;
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
break;
}
} else {
/* un-read this character and switch to Unicode Mode */
--source;
*target++=AMPERSAND;
if(offsets!=NULL) {
*offsets++=sourceIndex;
}
inDirectMode=FALSE;
base64Counter=0;
goto unicodeMode;
}
--length;
}
if(source<sourceLimit && target>=targetLimit) {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
} else {
unicodeMode:
while(source<sourceLimit) {
if(target<targetLimit) {
c=*source++;
if(isLegalIMAP(c)) {
/* encode directly */
inDirectMode=TRUE;
/* trick: back out this character to make this easier */
--source;
/* terminate the base64 sequence */
if(base64Counter!=0) {
/* write remaining bits for the previous character */
*target++=TO_BASE64_IMAP(bits);
if(offsets!=NULL) {
*offsets++=sourceIndex-1;
}
}
/* need to terminate with a minus */
if(target<targetLimit) {
*target++=MINUS;
if(offsets!=NULL) {
*offsets++=sourceIndex-1;
}
} else {
cnv->charErrorBuffer[0]=MINUS;
cnv->charErrorBufferLength=1;
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
break;
}
goto directMode;
} else {
/*
* base64 this character:
* Output 2 or 3 base64 bytes for the remaining bits of the previous character
* and the bits of this character, each implicitly in UTF-16BE.
*
* Here, bits is an 8-bit variable because only 6 bits need to be kept from one
* character to the next. The actual 2 or 4 bits are shifted to the left edge
* of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
*
* In each case the first byte is written unconditionally (room for it was
* checked at the top of the loop); each following byte goes either to the
* target or, if the target is full, into cnv->charErrorBuffer.
*/
switch(base64Counter) {
case 0:
/* no bits pending: output 6+6 bits, keep the low 4 bits */
b=(uint8_t)(c>>10);
*target++=TO_BASE64_IMAP(b);
if(target<targetLimit) {
b=(uint8_t)((c>>4)&0x3f);
*target++=TO_BASE64_IMAP(b);
if(offsets!=NULL) {
*offsets++=sourceIndex;
*offsets++=sourceIndex++;
}
} else {
if(offsets!=NULL) {
*offsets++=sourceIndex++;
}
b=(uint8_t)((c>>4)&0x3f);
cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
cnv->charErrorBufferLength=1;
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
bits=(uint8_t)((c&15)<<2);
base64Counter=1;
break;
case 1:
/* 4 bits pending: output (4+2)+6+6 bits, keep the low 2 bits */
b=(uint8_t)(bits|(c>>14));
*target++=TO_BASE64_IMAP(b);
if(target<targetLimit) {
b=(uint8_t)((c>>8)&0x3f);
*target++=TO_BASE64_IMAP(b);
if(target<targetLimit) {
b=(uint8_t)((c>>2)&0x3f);
*target++=TO_BASE64_IMAP(b);
if(offsets!=NULL) {
*offsets++=sourceIndex;
*offsets++=sourceIndex;
*offsets++=sourceIndex++;
}
} else {
if(offsets!=NULL) {
*offsets++=sourceIndex;
*offsets++=sourceIndex++;
}
b=(uint8_t)((c>>2)&0x3f);
cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
cnv->charErrorBufferLength=1;
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
} else {
if(offsets!=NULL) {
*offsets++=sourceIndex++;
}
b=(uint8_t)((c>>8)&0x3f);
cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
b=(uint8_t)((c>>2)&0x3f);
cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
cnv->charErrorBufferLength=2;
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
bits=(uint8_t)((c&3)<<4);
base64Counter=2;
break;
case 2:
/* 2 bits pending: output (2+4)+6+6 bits, nothing is left over */
b=(uint8_t)(bits|(c>>12));
*target++=TO_BASE64_IMAP(b);
if(target<targetLimit) {
b=(uint8_t)((c>>6)&0x3f);
*target++=TO_BASE64_IMAP(b);
if(target<targetLimit) {
b=(uint8_t)(c&0x3f);
*target++=TO_BASE64_IMAP(b);
if(offsets!=NULL) {
*offsets++=sourceIndex;
*offsets++=sourceIndex;
*offsets++=sourceIndex++;
}
} else {
if(offsets!=NULL) {
*offsets++=sourceIndex;
*offsets++=sourceIndex++;
}
b=(uint8_t)(c&0x3f);
cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
cnv->charErrorBufferLength=1;
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
} else {
if(offsets!=NULL) {
*offsets++=sourceIndex++;
}
b=(uint8_t)((c>>6)&0x3f);
cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
b=(uint8_t)(c&0x3f);
cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
cnv->charErrorBufferLength=2;
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
bits=0;
base64Counter=0;
break;
default:
/* will never occur */
break;
}
}
} else {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
break;
}
}
}
if(pArgs->flush && source>=sourceLimit) {
/* flush remaining bits to the target */
if(!inDirectMode) {
if(base64Counter!=0) {
if(target<targetLimit) {
*target++=TO_BASE64_IMAP(bits);
if(offsets!=NULL) {
*offsets++=sourceIndex-1;
}
} else {
/* append: the overflow buffer may already hold bytes from the loop above */
cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
}
/* need to terminate with a minus */
if(target<targetLimit) {
*target++=MINUS;
if(offsets!=NULL) {
*offsets++=sourceIndex-1;
}
} else {
cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
}
/* reset the state for the next conversion */
cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
} else {
/* set the converter state back into UConverter */
cnv->fromUnicodeStatus=
(cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
}
/* write back the updated pointers */
pArgs->source=source;
pArgs->target=(char *)target;
pArgs->offsets=offsets;
return;
}
/*
 * Implementation table for the IMAP-mailbox-name converter.
 * The initializer is positional; UConverterImpl is declared elsewhere, so
 * the meaning of each slot should be confirmed against that declaration.
 * Open, reset and getNextUChar reuse the UTF-7 functions; the two IMAP
 * conversion functions are each entered twice (presumably the plain and
 * the with-offsets variants - TODO confirm).
 */
static const UConverterImpl _IMAPImpl={
UCNV_IMAP_MAILBOX,
NULL,
NULL,
_UTF7Open,
NULL,
_UTF7Reset,
_IMAPToUnicodeWithOffsets,
_IMAPToUnicodeWithOffsets,
_IMAPFromUnicodeWithOffsets,
_IMAPFromUnicodeWithOffsets,
_UTF7GetNextUChar,
NULL,
NULL,
NULL /* we don't need writeSub() because we never call a callback at fromUnicode() */
};
/*
 * Static (constant) description of the IMAP-mailbox-name converter.
 * The initializer is positional; UConverterStaticData is declared elsewhere,
 * so field meanings beyond the existing comments should be confirmed there.
 * NOTE(review): "1, 4" presumably are the minimum and maximum number of
 * bytes per character - confirm against the struct declaration.
 */
static const UConverterStaticData _IMAPStaticData={
sizeof(UConverterStaticData),
"IMAP-mailbox-name",
0, /* TODO CCSID for UTF-7 */
UCNV_IBM, UCNV_IMAP_MAILBOX,
1, 4,
{ 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
FALSE, FALSE,
0,
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/*
 * Shared data record for the IMAP-mailbox-name converter: it ties together
 * _IMAPStaticData and _IMAPImpl. This is the object referenced from the
 * converterData[] table and the extern declaration list.
 * The initializer is positional and mirrors the _UTF7Data initializer;
 * confirm the remaining fields against the UConverterSharedData declaration.
 */
const UConverterSharedData _IMAPData={
sizeof(UConverterSharedData), ~((uint32_t)0),
NULL, NULL, &_IMAPStaticData, FALSE, &_IMAPImpl,
0
};

View File

@ -104,6 +104,7 @@ typedef enum {
UCNV_UTF16,
UCNV_UTF32,
UCNV_CESU8,
UCNV_IMAP_MAILBOX,
/* Number of converter types for which we have conversion routines. */
UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES

View File

@ -148,6 +148,14 @@ UTF32_OppositeEndian
# For details about email headers see RFC 2047.
UTF-7 { IANA* MIME* } cp65000
# IMAP-mailbox-name is an ICU-specific name for the encoding of IMAP mailbox names.
# It is a substantially modified UTF-7 encoding. See the specification in:
#
# RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
# (http://www.ietf.org/rfc/rfc2060.txt)
# Section 5.1.3. Mailbox International Naming Convention
IMAP-mailbox-name
SCSU { IANA* }
BOCU-1 { IANA* } csBOCU-1 { IANA }

View File

@ -1707,6 +1707,26 @@ static void TestSub(int32_t inputsize, int32_t outputsize)
}
}
log_verbose("Testing IMAP-mailbox-name toUnicode with substitute callbacks\n");
{
static const uint8_t bytes[]={
/* aDEL a&AB~ a&AB\x0c a&AB- a&AB. a&. */
0x61, 0x7f, 0x61, 0x26, 0x41, 0x42, 0x7e, 0x61, 0x26, 0x41, 0x42, 0x0c, 0x61, 0x26, 0x41, 0x42, 0x2d, 0x61, 0x26, 0x41, 0x42, 0x2e, 0x61, 0x26, 0x2e
};
static const UChar unicode[]={
0x61, 0xfffd, 0x61, 0xfffd, 0x61, 0xfffd, 0x61, 0xfffd, 0x61, 0xfffd, 0x61, 0xfffd
};
static const int32_t offsets[]={
0, 1, 2, 4, 7, 9, 12, 14, 17, 19, 22, 23
};
if(!testConvertToUnicode(bytes, ARRAY_LENGTH(bytes), unicode, ARRAY_LENGTH(unicode), "IMAP-mailbox-name",
UCNV_TO_U_CALLBACK_SUBSTITUTE, offsets, NULL, 0)
) {
log_err("IMAP-mailbox-name->u with substitute did not match.\n");
}
}
log_verbose("Testing UTF-16 toUnicode with substitute callbacks\n");
{
static const uint8_t

View File

@ -1772,11 +1772,16 @@ doTestTruncated(const char *cnvName, const uint8_t *bytes, int32_t length) {
static void
TestTruncated() {
struct {
static const struct {
const char *cnvName;
uint8_t bytes[8]; /* partial input bytes resulting in no output */
int32_t length;
} testCases[]={
{ "IMAP-mailbox-name", { 0x26 }, 1 }, /* & */
{ "IMAP-mailbox-name", { 0x26, 0x42 }, 2 }, /* &B */
{ "IMAP-mailbox-name", { 0x26, 0x42, 0x42 }, 3 }, /* &BB */
{ "IMAP-mailbox-name", { 0x26, 0x41, 0x41 }, 3 }, /* &AA */
{ "UTF-7", { 0x2b, 0x42 }, 2 }, /* +B */
{ "UTF-8", { 0xd1 }, 1 },

View File

@ -34,6 +34,7 @@ static void TestConverterTypesAndStarters(void);
static void TestAmbiguous(void);
static void TestSignatureDetection(void);
static void TestUTF7(void);
static void TestIMAP(void);
static void TestUTF8(void);
static void TestCESU8(void);
static void TestUTF16(void);
@ -145,7 +146,9 @@ TestNextUChar(UConverter* cnv, const char* source, const char* limit, const uint
while(s<limit) {
s0=s;
c=ucnv_getNextUChar(cnv, &s, limit, &errorCode);
if(U_FAILURE(errorCode)) {
if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
break; /* no more significant input */
} else if(U_FAILURE(errorCode)) {
log_err("%s ucnv_getNextUChar() failed: %s\n", message, u_errorName(errorCode));
break;
} else if((uint32_t)(s-s0)!=*r || c!=*(r+1)) {
@ -210,6 +213,7 @@ void addTestNewConvert(TestNode** root)
addTest(root, &TestAmbiguous, "tsconv/nucnvtst/TestAmbiguous");
addTest(root, &TestSignatureDetection, "tsconv/nucnvtst/TestSignatureDetection");
addTest(root, &TestUTF7, "tsconv/nucnvtst/TestUTF7");
addTest(root, &TestIMAP, "tsconv/nucnvtst/TestIMAP");
addTest(root, &TestUTF8, "tsconv/nucnvtst/TestUTF8");
addTest(root, &TestCESU8, "tsconv/nucnvtst/TestCESU8");
addTest(root, &TestUTF16, "tsconv/nucnvtst/TestUTF16");
@ -404,7 +408,7 @@ static ETestConvertResult testConvertFromU( const UChar *source, int sourceLen,
log_err("\n");
log_err("Got : ");
for(p=junkout;p<targ;p++) {
log_err("%d, ", junokout[p-junkout]);
log_err("%d,", junokout[p-junkout]);
}
log_err("\n");
log_err("Expected: ");
@ -880,7 +884,7 @@ static void TestNewConvertWithBufferSizes(int32_t outsize, int32_t insize )
27, 29, 32
};
static const int32_t fromUnicodeOffsets[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 9, 9, 10,
0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 9, 10,
11, 12, 12, 12, 13, 13, 13, 13, 14,
15, 15,
16, 16, 16, 17, 17, 17, 18, 18, 18
@ -906,19 +910,95 @@ static void TestNewConvertWithBufferSizes(int32_t outsize, int32_t insize )
31, 33, 36
};
static const int32_t fromUnicodeOffsetsR[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 9, 9, 10, 10, 10, 10, 11,
0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 9, 10, 10, 10, 10, 10,
11, 12, 12, 12, 13, 13, 13, 13, 14,
15, 15,
16, 16, 16, 17, 17, 17, 18, 18, 18
};
testConvertFromU(unicode, sizeof(unicode)/U_SIZEOF_UCHAR, utf7, sizeof(utf7), "UTF-7", fromUnicodeOffsets,FALSE);
testConvertFromU(unicode, sizeof(unicode)/U_SIZEOF_UCHAR, utf7, sizeof(utf7), "UTF-7", fromUnicodeOffsets,FALSE);
testConvertToU(utf7, sizeof(utf7), unicode, sizeof(unicode)/U_SIZEOF_UCHAR, "UTF-7", toUnicodeOffsets,FALSE);
testConvertToU(utf7, sizeof(utf7), unicode, sizeof(unicode)/U_SIZEOF_UCHAR, "UTF-7", toUnicodeOffsets,FALSE);
testConvertFromU(unicode, sizeof(unicode)/U_SIZEOF_UCHAR, utf7Restricted, sizeof(utf7Restricted), "UTF-7,version=1", fromUnicodeOffsetsR,FALSE);
testConvertFromU(unicode, sizeof(unicode)/U_SIZEOF_UCHAR, utf7Restricted, sizeof(utf7Restricted), "UTF-7,version=1", fromUnicodeOffsetsR,FALSE);
testConvertToU(utf7Restricted, sizeof(utf7Restricted), unicode, sizeof(unicode)/U_SIZEOF_UCHAR, "UTF-7,version=1", toUnicodeOffsetsR,FALSE);
testConvertToU(utf7Restricted, sizeof(utf7Restricted), unicode, sizeof(unicode)/U_SIZEOF_UCHAR, "UTF-7,version=1", toUnicodeOffsetsR,FALSE);
}
/*
* IMAP-mailbox-name examples are mostly from http://www.imc.org/rfc2152,
* modified according to RFC 2060,
* and supplemented with the one example in RFC 2060 itself.
*/
{
static const uint8_t imap[] = {
/* Hi Mom -&Jjo--!
A&ImIDkQ-.
&-
&ZeVnLIqe-
\
~peter
/mail
/&ZeVnLIqe-
/&U,BTFw-
*/
0x48, 0x69, 0x20, 0x4d, 0x6f, 0x6d, 0x20, 0x2d, 0x26, 0x4a, 0x6a, 0x6f, 0x2d, 0x2d, 0x21,
0x41, 0x26, 0x49, 0x6d, 0x49, 0x44, 0x6b, 0x51, 0x2d, 0x2e,
0x26, 0x2d,
0x26, 0x5a, 0x65, 0x56, 0x6e, 0x4c, 0x49, 0x71, 0x65, 0x2d,
0x5c,
0x7e, 0x70, 0x65, 0x74, 0x65, 0x72,
0x2f, 0x6d, 0x61, 0x69, 0x6c,
0x2f, 0x26, 0x5a, 0x65, 0x56, 0x6e, 0x4c, 0x49, 0x71, 0x65, 0x2d,
0x2f, 0x26, 0x55, 0x2c, 0x42, 0x54, 0x46, 0x77, 0x2d
};
static const UChar unicode[] = {
/* Hi Mom -<WHITE SMILING FACE>-!
A<NOT IDENTICAL TO><ALPHA>.
&
[Japanese word "nihongo"]
\
~peter
/mail
/<65e5, 672c, 8a9e>
/<53f0, 5317>
*/
0x48, 0x69, 0x20, 0x4d, 0x6f, 0x6d, 0x20, 0x2d, 0x263a, 0x2d, 0x21,
0x41, 0x2262, 0x0391, 0x2e,
0x26,
0x65e5, 0x672c, 0x8a9e,
0x5c,
0x7e, 0x70, 0x65, 0x74, 0x65, 0x72,
0x2f, 0x6d, 0x61, 0x69, 0x6c,
0x2f, 0x65e5, 0x672c, 0x8a9e,
0x2f, 0x53f0, 0x5317
};
static const int32_t toUnicodeOffsets[] = {
0, 1, 2, 3, 4, 5, 6, 7, 9, 13, 14,
15, 17, 19, 24,
25,
28, 30, 33,
37,
38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48,
49, 51, 53, 56,
60, 62, 64
};
static const int32_t fromUnicodeOffsets[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 9, 10,
11, 12, 12, 12, 13, 13, 13, 13, 13, 14,
15, 15,
16, 16, 16, 17, 17, 17, 18, 18, 18, 18,
19,
20, 21, 22, 23, 24, 25,
26, 27, 28, 29, 30,
31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34,
35, 36, 36, 36, 37, 37, 37, 37, 37
};
testConvertFromU(unicode, sizeof(unicode)/U_SIZEOF_UCHAR, imap, sizeof(imap), "IMAP-mailbox-name", fromUnicodeOffsets,FALSE);
testConvertToU(imap, sizeof(imap), unicode, sizeof(unicode)/U_SIZEOF_UCHAR, "IMAP-mailbox-name", toUnicodeOffsets,FALSE);
}
/* Test UTF-8 bad data handling*/
@ -1224,6 +1304,7 @@ static void TestConverterTypesAndStarters()
TestConverterType("x-iscii-de", UCNV_ISCII);
TestConverterType("ascii", UCNV_US_ASCII);
TestConverterType("utf-7", UCNV_UTF7);
TestConverterType("IMAP-mailbox-name", UCNV_IMAP_MAILBOX);
TestConverterType("bocu-1", UCNV_BOCU1);
}
@ -1595,6 +1676,50 @@ static TestUTF7() {
ucnv_close(cnv);
}
/*
 * Test ucnv_getNextUChar() with the IMAP-mailbox-name converter:
 * checks the number of bytes consumed and the code point returned for each
 * call, the error for an empty source range, and the converter's name.
 *
 * The definition uses "static void" with a (void) parameter list so that it
 * matches the forward declaration and avoids the obsolescent placement of
 * the storage-class specifier after the type specifier.
 */
static void
TestIMAP(void) {
    /* test input */
    static const uint8_t in[]={
        /* H   -   &Jjo-   -   !   &-   &2AHcAQ- */
        0x48,
        0x2d,
        0x26, 0x4a, 0x6a, 0x6f,
        0x2d, 0x2d,
        0x21,
        0x26, 0x2d,
        0x26, 0x32, 0x41, 0x48, 0x63, 0x41, 0x51, 0x2d
    };

    /* expected test results */
    static const uint32_t results[]={
        /* number of bytes read, code point */
        1, 0x48,
        1, 0x2d,
        4, 0x263a, /* <WHITE SMILING FACE> */
        2, 0x2d,   /* the minus that ends the base64 sequence plus a literal minus */
        1, 0x21,
        2, 0x26,
        7, 0x10401 /* the final terminating minus is not counted here */
    };

    const char *cnvName;
    const char *source=(const char *)in, *limit=(const char *)in+sizeof(in);
    UErrorCode errorCode=U_ZERO_ERROR;
    UConverter *cnv=ucnv_open("IMAP-mailbox-name", &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("Unable to open a IMAP-mailbox-name converter: %s\n", u_errorName(errorCode)); /* shouldn't be a data err */
        return;
    }

    TestNextUChar(cnv, source, limit, results, "IMAP-mailbox-name");

    /* Test the condition when source >= sourceLimit */
    TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");

    cnvName = ucnv_getName(cnv, &errorCode);
    if (U_FAILURE(errorCode) || uprv_strcmp(cnvName, "IMAP-mailbox-name") != 0) {
        /* cnvName may be NULL when ucnv_getName() failed: never pass NULL for %s */
        log_err("IMAP-mailbox-name converter is called %s: %s\n",
                cnvName != NULL ? cnvName : "(null)", u_errorName(errorCode));
    }
    ucnv_close(cnv);
}
void
static TestUTF8() {
/* test input */
@ -2596,7 +2721,9 @@ TestGetNextUChar2022(UConverter* cnv, const char* source, const char* limit,
while(s<limit) {
s0=s;
c=ucnv_getNextUChar(cnv, &s, limit, &errorCode);
if(U_FAILURE(errorCode)) {
if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
break; /* no more significant input */
} else if(U_FAILURE(errorCode)) {
log_err("%s ucnv_getNextUChar() failed: %s\n", message, u_errorName(errorCode));
break;
} else {
@ -3196,6 +3323,8 @@ TestRoundTrippingAllUTF(void){
TestFullRoundtrip("UTF-7");
log_verbose("Running exhaustive round trip test for UTF-7\n");
TestFullRoundtrip("UTF-7,version=1");
log_verbose("Running exhaustive round trip test for IMAP-mailbox-name\n");
TestFullRoundtrip("IMAP-mailbox-name");
log_verbose("Running exhaustive round trip test for GB18030\n");
TestFullRoundtrip("GB18030");
}