ICU-12766 aix/escaper: WIP. Support u8
* support u8 * main tools now builds on AIX * remove c'tors seen as duplicate under AIX * successful build under AIX X-SVN-Rev: 39818
This commit is contained in:
parent
4abad560e4
commit
b9ecfe69c3
@ -194,7 +194,7 @@ EXPAND_ONLY_PREDEF = YES
|
||||
SEARCH_INCLUDES = YES
|
||||
INCLUDE_PATH =
|
||||
INCLUDE_FILE_PATTERNS =
|
||||
PREDEFINED = U_EXPORT2= U_STABLE= U_DRAFT= U_INTERNAL= U_SYSTEM= U_DEPRECATED= U_OBSOLETE= U_CALLCONV= U_CDECL_BEGIN= U_CDECL_END= U_NO_THROW=\ "U_NAMESPACE_BEGIN=namespace icu{" "U_NAMESPACE_END=}" U_SHOW_CPLUSPLUS_API=1 U_DEFINE_LOCAL_OPEN_POINTER()= U_IN_DOXYGEN=1 U_OVERRIDE= U_FINAL= UCONFIG_ENABLE_PLUGINS=1
|
||||
PREDEFINED = U_EXPORT2= U_STABLE= U_DRAFT= U_INTERNAL= U_SYSTEM= U_DEPRECATED= U_OBSOLETE= U_CALLCONV= U_CDECL_BEGIN= U_CDECL_END= U_NO_THROW=\ "U_NAMESPACE_BEGIN=namespace icu{" "U_NAMESPACE_END=}" U_SHOW_CPLUSPLUS_API=1 U_DEFINE_LOCAL_OPEN_POINTER()= U_IN_DOXYGEN=1 U_OVERRIDE= U_FINAL= UCONFIG_ENABLE_PLUGINS=1 U_CHAR16_IS_TYPEDEF=0 U_CPLUSPLUS_VERSION=11 U_NO_NULLPTR_T=0
|
||||
EXPAND_AS_DEFINED =
|
||||
SKIP_FUNCTION_MACROS = YES
|
||||
#---------------------------------------------------------------------------
|
||||
|
@ -659,7 +659,7 @@ ConvertFile::convertFile(const char *pname,
|
||||
parse.line = -1;
|
||||
|
||||
if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) {
|
||||
t = Transliterator::createFromRules(UNICODE_STRING_SIMPLE("Uconv"), str, UTRANS_FORWARD, parse, err);
|
||||
t = Transliterator::createFromRules(UnicodeString(u"Uconv"), str, UTRANS_FORWARD, parse, err);
|
||||
} else {
|
||||
t = Transliterator::createInstance(UnicodeString(translit, -1, US_INV), UTRANS_FORWARD, err);
|
||||
}
|
||||
|
@ -13,6 +13,19 @@
|
||||
// with caution:
|
||||
#include "unicode/utf8.h"
|
||||
|
||||
// Characters the escaper looks for, spelled as numeric code values rather
// than character literals. The values below are the ASCII code points.
// NOTE(review): presumably numeric so the comparison does not depend on the
// compiler's execution character set — confirm.
static const char
|
||||
kSPACE = 0x20, // ' '
|
||||
kTAB = 0x09, // horizontal tab
|
||||
kLF = 0x0A, // line feed
|
||||
kCR = 0x0D, // carriage return
|
||||
kHASH = 0x23, // '#'
|
||||
kSLASH = 0x2f, // '/' (forward slash; the backslash would be 0x5C)
|
||||
kSTAR = 0x2A, // '*'
|
||||
kL_U = 0x75, // 'u'
|
||||
kU_U = 0x55, // 'U'
|
||||
kQUOT = 0x27, // '\''
|
||||
kDBLQ = 0x22; // '"'
|
||||
|
||||
std::string prog;
|
||||
|
||||
void usage() {
|
||||
@ -39,6 +52,7 @@ int cleanup(const std::string &outfile) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if 0
|
||||
inline bool hasNonAscii(const char *line, size_t len) {
|
||||
const unsigned char *uline = reinterpret_cast<const unsigned char*>(line);
|
||||
for(size_t i=0;i<len; i++) {
|
||||
@ -48,14 +62,15 @@ inline bool hasNonAscii(const char *line, size_t len) {
|
||||
}
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
inline const char *skipws(const char *p, const char *e) {
|
||||
for(;p<e;p++) {
|
||||
switch(*p) {
|
||||
case ' ':
|
||||
case '\t':
|
||||
case '\n':
|
||||
case '\r':
|
||||
case kSPACE:
|
||||
case kTAB:
|
||||
case kLF:
|
||||
case kCR:
|
||||
break;
|
||||
default:
|
||||
return p; // non ws
|
||||
@ -64,6 +79,7 @@ inline const char *skipws(const char *p, const char *e) {
|
||||
return p;
|
||||
}
|
||||
|
||||
#if 0
|
||||
inline bool isCommentOrEmpty(const char* line, size_t len) {
|
||||
const char *p = line;
|
||||
const char *e = line+len;
|
||||
@ -73,13 +89,13 @@ inline bool isCommentOrEmpty(const char* line, size_t len) {
|
||||
}
|
||||
p++;
|
||||
switch(*p) {
|
||||
case '#': return true; // #directive
|
||||
case '/':
|
||||
case kHASH: return true; // #directive
|
||||
case kSLASH:
|
||||
p++;
|
||||
if(p==e) return false; // single slash
|
||||
switch(*p) {
|
||||
case '/': // '/ /'
|
||||
case '*': // '/ *'
|
||||
case kSLASH: // '/ /'
|
||||
case kSTAR: // '/ *'
|
||||
return true; // start of comment
|
||||
default: return false; // something else
|
||||
}
|
||||
@ -87,6 +103,82 @@ inline bool isCommentOrEmpty(const char* line, size_t len) {
|
||||
}
|
||||
/*NOTREACHED*/
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
 * Append one byte to outstr as a C hex escape of the form \xNN
 * (backslash, 'x', two uppercase hex digits).
 *
 * @param outstr string to append to
 * @param byte the byte value to escape
 */
void appendByte(std::string &outstr,
                uint8_t byte) {
    // "\xNN" is four characters plus the terminating NUL, so five bytes are
    // required; a four-byte buffer is overrun by the terminator.
    char tmp2[5];
    snprintf(tmp2, sizeof(tmp2), "\\x%02X", 0xFF & static_cast<int>(byte));
    outstr += tmp2;
}
|
||||
|
||||
/**
|
||||
* @return true on failure
|
||||
*/
|
||||
bool appendUtf8(std::string &outstr,
|
||||
const std::string &linestr,
|
||||
size_t &pos,
|
||||
size_t chars) {
|
||||
char tmp[9];
|
||||
for(size_t i=0;i<chars;i++) {
|
||||
tmp[i] = linestr[++pos];
|
||||
}
|
||||
tmp[chars] = 0;
|
||||
UChar32 ch;
|
||||
sscanf(tmp, "%X", &ch);
|
||||
|
||||
// now to append \\x%% etc
|
||||
uint8_t bytesNeeded = U8_LENGTH(ch);
|
||||
if(bytesNeeded == 0) {
|
||||
fprintf(stderr, "Illegal code point U+%X\n", ch);
|
||||
return true;
|
||||
}
|
||||
uint8_t bytes[4];
|
||||
uint8_t *s = bytes;
|
||||
size_t i = 0;
|
||||
U8_APPEND_UNSAFE(s, i, ch);
|
||||
for(size_t t = 0; t<i; t++) {
|
||||
appendByte(outstr, s[t]);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param linestr string to mutate. Already escaped into \u format.
|
||||
* @param origpos beginning, points to 'u8"'
|
||||
* @param pos end, points to "
|
||||
* @return false for no-problem, true for failure!
|
||||
*/
|
||||
bool fixu8(std::string &linestr, size_t origpos, size_t &endpos) {
|
||||
size_t pos = origpos + 3;
|
||||
std::string outstr;
|
||||
outstr += (kDBLQ);
|
||||
for(;pos<endpos;pos++) {
|
||||
char c = linestr[pos];
|
||||
if(c == kSLASH) {
|
||||
char c2 = linestr[++pos];
|
||||
switch(c2) {
|
||||
case kQUOT:
|
||||
case kDBLQ:
|
||||
appendByte(outstr, c2);
|
||||
break;
|
||||
case kL_U:
|
||||
appendUtf8(outstr, linestr, pos, 4);
|
||||
break;
|
||||
case kU_U:
|
||||
appendUtf8(outstr, linestr, pos, 8);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
appendByte(outstr, c);
|
||||
}
|
||||
}
|
||||
outstr += (kDBLQ);
|
||||
|
||||
linestr.replace(origpos, (endpos-origpos+1), outstr);
|
||||
|
||||
return false; // OK
|
||||
}
|
||||
|
||||
/**
|
||||
* fix the string at the position
|
||||
@ -94,18 +186,46 @@ inline bool isCommentOrEmpty(const char* line, size_t len) {
|
||||
* true = had err
|
||||
*/
|
||||
bool fixAt(std::string &linestr, size_t pos) {
|
||||
size_t origpos = pos;
|
||||
|
||||
if(linestr[pos] != 'u') {
|
||||
fprintf(stderr, "Not a 'u'?");
|
||||
return true;
|
||||
}
|
||||
|
||||
char quote = linestr[pos+1];
|
||||
pos++; // past 'u'
|
||||
|
||||
bool utf8 = false;
|
||||
|
||||
if(linestr[pos] == '8') { // u8"
|
||||
utf8 = true;
|
||||
pos++;
|
||||
}
|
||||
|
||||
char quote = linestr[pos];
|
||||
|
||||
if(quote != '\'' && quote != '\"') {
|
||||
fprintf(stderr, "Quote is '%c' - not sure what to do.\n", quote);
|
||||
return true;
|
||||
}
|
||||
|
||||
if(quote == '\'' && utf8) {
|
||||
fprintf(stderr, "Cannot do u8'...'\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
pos ++;
|
||||
|
||||
//printf("u%c…%c\n", quote, quote);
|
||||
|
||||
|
||||
for(pos += 2; pos < linestr.size(); pos++) {
|
||||
if(linestr[pos] == quote) return false; // end of quote
|
||||
for(; pos < linestr.size(); pos++) {
|
||||
if(linestr[pos] == quote) {
|
||||
if(utf8) {
|
||||
return fixu8(linestr, origpos, pos); // fix u8"..."
|
||||
} else {
|
||||
return false; // end of quote
|
||||
}
|
||||
}
|
||||
if(linestr[pos] == '\\') {
|
||||
pos++;
|
||||
if(linestr[pos] == quote) continue; // quoted quote
|
||||
@ -156,19 +276,20 @@ bool fixLine(int /*no*/, std::string &linestr) {
|
||||
size_t len = linestr.size();
|
||||
|
||||
// no u' in the line?
|
||||
if(!strstr(line, "u'") && !strstr(line, "u\"")) {
|
||||
if(!strstr(line, "u'") && !strstr(line, "u\"") && !strstr(line, "u8\"")) {
|
||||
return false; // Nothing to do. No u' or u" detected
|
||||
}
|
||||
|
||||
// Quick Check: all ascii?
|
||||
if(!hasNonAscii(line, len)) {
|
||||
return false; // ASCII
|
||||
}
|
||||
// lines such as u8"\u0308" are all ASCII.
|
||||
// // Quick Check: all ascii?
|
||||
// if(!hasNonAscii(line, len)) {
|
||||
// return false; // ASCII
|
||||
// }
|
||||
|
||||
// comment or empty line?
|
||||
if(isCommentOrEmpty(line, len)) {
|
||||
return false; // Comment or just empty
|
||||
}
|
||||
// // comment or empty line?
|
||||
// if(isCommentOrEmpty(line, len)) {
|
||||
// return false; // Comment or just empty
|
||||
// }
|
||||
|
||||
// start from the end and find all u" cases
|
||||
size_t pos = len = linestr.size();
|
||||
@ -188,6 +309,14 @@ bool fixLine(int /*no*/, std::string &linestr) {
|
||||
pos--;
|
||||
}
|
||||
|
||||
// reset and find all u8" cases
|
||||
pos = len = linestr.size();
|
||||
while((pos>0) && (pos = linestr.rfind("u8\"", pos)) != std::string::npos) {
|
||||
if(fixAt(linestr, pos)) return true;
|
||||
if(pos == 0) break;
|
||||
pos--;
|
||||
}
|
||||
|
||||
//fprintf(stderr, "%d - fixed\n", no);
|
||||
return false;
|
||||
}
|
||||
|
@ -8,3 +8,9 @@ u"\U000219F2";
|
||||
u"sa\u0127\u0127a";
|
||||
u'\u6587'; u"\U000219F2";
|
||||
|
||||
"\x20\x5C\x75\x30\x33\x30\x31";
|
||||
"\x5C\x75\x30\x33\x30\x38\x20";
|
||||
"\x73\x61\x5C\x75\x30\x31\x32\x37\x5C\x75\x30\x31\x32\x37\x61";
|
||||
"\x5C\x75\x36\x35\x38\x37";
|
||||
"\x5C\x55\x30\x30\x30\x32\x31\x39\x46\x32";
|
||||
"\x73\x61\x5C\x75\x30\x31\x32\x37\x5C\x75\x30\x31\x32\x37\x61";
|
||||
|
@ -8,3 +8,9 @@ u"𡧲";
|
||||
u"saħħa";
|
||||
u'文'; u"𡧲";
|
||||
|
||||
u8" \u0301";
|
||||
u8"\u0308 ";
|
||||
u8"saħħa";
|
||||
u8"文";
|
||||
u8"𡧲";
|
||||
u8"saħ\u0127a";
|
||||
|
Loading…
Reference in New Issue
Block a user