iconv: Add UTF-7-IMAP variant in utf-7.c

UTF-7-IMAP differs from UTF-7 in the following ways (see RFC 3501[1]
for reference):

- The shift character is '&' instead of '+'
- There are no "optional direct characters" and the "direct characters"
  set is different
- There is no implicit shift back to US-ASCII from BASE64; all BASE64
  sequences MUST be terminated with '-'

[1]: https://datatracker.ietf.org/doc/html/rfc3501#section-5.1.3

Signed-off-by: Max Gautier <mg@max.gautier.name>
Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
This commit is contained in:
Max Gautier 2022-03-21 09:25:05 -03:00 committed by Adhemerval Zanella
parent ef7b963280
commit 9df157b4ed
5 changed files with 62 additions and 6 deletions

View File

@ -94,6 +94,7 @@ EUC-TW EUC-TW Y UTF8
GBK GBK Y UTF8
BIG5HKSCS BIG5HKSCS Y UTF8
UTF-7 UTF-7 N UTF8
UTF-7-IMAP UTF-7-IMAP N UTF8
IBM856 IBM856 N UTF8
IBM922 IBM922 Y UTF8
IBM930 IBM930 N UTF8

View File

@ -113,3 +113,7 @@ module INTERNAL UTF-32BE// UTF-32 1
alias UTF7// UTF-7//
module UTF-7// INTERNAL UTF-7 1
module INTERNAL UTF-7// UTF-7 1
# from to module cost
module UTF-7-IMAP// INTERNAL UTF-7 1
module INTERNAL UTF-7-IMAP// UTF-7 1

1
iconvdata/testdata/UTF-7-IMAP vendored Normal file
View File

@ -0,0 +1 @@
&EqASGxItEps- Amharic&AAoBDQ-esky Czech&AAo-Dansk Danish&AAo-English English&AAo-Suomi Finnish&AAo-Fran&AOc-ais French&AAo-Deutsch German&AAoDlQO7A7sDtwO9A7kDugOs- Greek&AAoF4gXRBegF2QXq- Hebrew&AAo-Italiano Italian&AAo-Norsk Norwegian&AAoEIARDBEEEQQQ6BDgEOQ- Russian&AAo-Espa&APE-ol Spanish&AAo-Svenska Swedish&AAoOIA4yDikOMg5EDhcOIg- Thai&AAo-T&APw-rk&AOc-e Turkish&AAo-Ti&Hr8-ng Vi&Hsc-t Vietnamese&AApl5Wcsip4- Japanese&AApOLWWH- Chinese&AArVXK4A- Korean&AAoACg-// Checking for correct handling of shift characters ('&-', '-') after base64 sequences&AArVXK4A-&-&AArVXK4A--&AAoACg-// Checking for correct handling of litteral '&-' and '-'&AAo----&-&--&AAoACg-// The last line of this file is missing the end-of-line terminator&AAo-// on purpose, in order to test that the conversion empties the bit buffer&AAo-// and shifts back to the initial state at the end of the conversion.&AAo-A&ImIDkQ-

32
iconvdata/testdata/UTF-7-IMAP..UTF8 vendored Normal file
View File

@ -0,0 +1,32 @@
አማርኛ Amharic
česky Czech
Dansk Danish
English English
Suomi Finnish
Français French
Deutsch German
Ελληνικά Greek
עברית Hebrew
Italiano Italian
Norsk Norwegian
Русский Russian
Español Spanish
Svenska Swedish
ภาษาไทย Thai
Türkçe Turkish
Tiếng Việt Vietnamese
日本語 Japanese
中文 Chinese
한글 Korean
// Checking for correct handling of shift characters ('&', '-') after base64 sequences
한글&
한글-
// Checking for correct handling of litteral '&' and '-'
---&&-
// The last line of this file is missing the end-of-line terminator
// on purpose, in order to test that the conversion empties the bit buffer
// and shifts back to the initial state at the end of the conversion.
A≢Α

View File

@ -33,11 +33,13 @@
/* Encoding variants handled by this converter.  The enumerators must
   stay in the same order as the entries of the names[] table.  */
enum variant
{
/* Plain UTF-7; its shift character is '+'.  */
UTF7,
/* The IMAP mailbox-name flavour of UTF-7; its shift character is '&'.  */
UTF_7_IMAP
};
/* Charset names of the supported variants, stored back to back as
   NUL-terminated strings; a final empty string ends the list.
   Must be in the same order as enum variant above. */
static const char names[] =
"UTF-7//\0"
"UTF-7-IMAP//\0"
"\0";
static uint32_t
@ -45,6 +47,8 @@ shift_character (enum variant const var)
{
if (var == UTF7)
return '+';
else if (var == UTF_7_IMAP)
return '&';
else
abort ();
}
@ -58,6 +62,9 @@ between (uint32_t const ch,
/* The set of "direct characters":
FOR UTF-7
A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
FOR UTF-7-IMAP
A-Z a-z 0-9 ' ( ) , - . / : ? space
! " # $ % + * ; < = > @ [ \ ] ^ _ ` { | } ~
*/
static bool
@ -71,6 +78,8 @@ isdirect (uint32_t ch, enum variant var)
|| between (ch, ',', '/')
|| ch == ':' || ch == '?'
|| ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
else if (var == UTF_7_IMAP)
return (ch != '&' && between (ch, ' ', '~'));
abort ();
}
@ -124,6 +133,8 @@ base64 (unsigned int i, enum variant var)
return '+';
else if (i == 63 && var == UTF7)
return '/';
else if (i == 63 && var == UTF_7_IMAP)
return ',';
else
abort ();
}
@ -308,7 +319,8 @@ gconv_end (struct __gconv_step *data)
i = ch - '0' + 52; \
else if (ch == '+') \
i = 62; \
else if (ch == '/') \
else if ((var == UTF7 && ch == '/') \
|| (var == UTF_7_IMAP && ch == ',')) \
i = 63; \
else \
{ \
@ -316,8 +328,10 @@ gconv_end (struct __gconv_step *data)
\
/* If accumulated data is nonzero, the input is invalid. */ \
/* Also, partial UTF-16 characters are invalid. */ \
if (__builtin_expect (statep->__value.__wch != 0, 0) \
|| __builtin_expect ((statep->__count >> 3) <= 26, 0)) \
/* In IMAP variant, must be terminated by '-'. */ \
if (__glibc_unlikely (statep->__value.__wch != 0) \
|| __glibc_unlikely ((statep->__count >> 3) <= 26) \
|| __glibc_unlikely (var == UTF_7_IMAP && ch != '-')) \
{ \
STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1)); \
} \
@ -474,13 +488,15 @@ gconv_end (struct __gconv_step *data)
else \
{ \
/* base64 encoding active */ \
if (isdirect (ch, var)) \
if ((var == UTF_7_IMAP && ch == '&') || isdirect (ch, var)) \
{ \
/* deactivate base64 encoding */ \
size_t count; \
\
count = ((statep->__count & 0x18) >= 0x10) \
+ needs_explicit_shift (ch) + 1; \
+ (var == UTF_7_IMAP || needs_explicit_shift (ch)) \
+ (var == UTF_7_IMAP && ch == '&') \
+ 1; \
if (__glibc_unlikely (outptr + count > outend)) \
{ \
result = __GCONV_FULL_OUTPUT; \
@ -489,9 +505,11 @@ gconv_end (struct __gconv_step *data)
\
if ((statep->__count & 0x18) >= 0x10) \
*outptr++ = base64 ((statep->__count >> 3) & ~3, var); \
if (needs_explicit_shift (ch)) \
if (var == UTF_7_IMAP || needs_explicit_shift (ch)) \
*outptr++ = '-'; \
*outptr++ = (unsigned char) ch; \
if (var == UTF_7_IMAP && ch == '&') \
*outptr++ = '-'; \
statep->__count = 0; \
} \
else \