scuffed-code/icu4c/source/i18n/unesctrn.cpp

294 lines
9.3 KiB
C++

// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (c) 2001-2011, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/19/2001 aliu Creation.
**********************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_TRANSLITERATION
#include "unicode/uchar.h"
#include "unicode/utf16.h"
#include "unesctrn.h"
#include "util.h"
#include "cmemory.h"
U_NAMESPACE_BEGIN
/**
 * Sentinel code unit that terminates every spec[] array.
 */
static const UChar END = 0xFFFF;

/*
 * Layout of the tables below: a sequence of forms followed by END.
 * A single form is
 *
 *     prefixLen, suffixLen, radix, minDigits, maxDigits,
 *     <prefixLen prefix code units>, <suffixLen suffix code units>
 *
 * Forms are tried in the order in which they are listed.
 */

// Unicode: "U+10FFFF" hex, min=4, max=6
static const UChar SPEC_Unicode[] = {
    2, 0, 16, 4, 6,
    0x0055 /*U*/, 0x002B /*+*/,
    END
};

// Java: "\\uFFFF" hex, min=4, max=4
static const UChar SPEC_Java[] = {
    2, 0, 16, 4, 4,
    0x005C /*\*/, 0x0075 /*u*/,
    END
};

// C: "\\uFFFF" hex, min=4, max=4; "\\U0010FFFF" hex, min=8, max=8
static const UChar SPEC_C[] = {
    2, 0, 16, 4, 4,
    0x005C /*\*/, 0x0075 /*u*/,

    2, 0, 16, 8, 8,
    0x005C /*\*/, 0x0055 /*U*/,
    END
};

// XML: "&#x10FFFF;" hex, min=1, max=6
static const UChar SPEC_XML[] = {
    3, 1, 16, 1, 6,
    0x0026 /*&*/, 0x0023 /*#*/, 0x0078 /*x*/,
    0x003B /*;*/,
    END
};

// XML10: "&#1114111;" dec, min=1, max=7 (not really "Hex-Any")
static const UChar SPEC_XML10[] = {
    2, 1, 10, 1, 7,
    0x0026 /*&*/, 0x0023 /*#*/,
    0x003B /*;*/,
    END
};

// Perl: "\\x{263A}" hex, min=1, max=6
static const UChar SPEC_Perl[] = {
    3, 1, 16, 1, 6,
    0x005C /*\*/, 0x0078 /*x*/, 0x007B /*{*/,
    0x007D /*}*/,
    END
};

// All forms, in match order: Unicode, Java, C (8-digit form), XML, XML10, Perl
static const UChar SPEC_Any[] = {
    // Unicode
    2, 0, 16, 4, 6,
    0x0055 /*U*/, 0x002B /*+*/,

    // Java
    2, 0, 16, 4, 4,
    0x005C /*\*/, 0x0075 /*u*/,

    // C, 8-digit form (backslash followed by capital U)
    2, 0, 16, 8, 8,
    0x005C /*\*/, 0x0055 /*U*/,

    // XML
    3, 1, 16, 1, 6,
    0x0026 /*&*/, 0x0023 /*#*/, 0x0078 /*x*/,
    0x003B /*;*/,

    // XML10
    2, 1, 10, 1, 7,
    0x0026 /*&*/, 0x0023 /*#*/,
    0x003B /*;*/,

    // Perl
    3, 1, 16, 1, 6,
    0x005C /*\*/, 0x0078 /*x*/, 0x007B /*{*/,
    0x007D /*}*/,
    END
};
// Class-level boilerplate for UnescapeTransliterator. The macro is defined
// outside this file; judging by its name it supplies the class's RTTI
// support -- confirm against the macro's definition.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
/**
 * Makes a heap copy of an END-terminated spec array.
 *
 * @param spec the array to copy; it must be terminated by END. May be NULL
 *             (for example when the array being copied is itself the result
 *             of a failed copySpec call), in which case NULL is returned.
 * @return a uprv_malloc'ed copy that includes the END terminator, or NULL if
 *         spec is NULL or the allocation fails. The caller releases the
 *         result with uprv_free.
 */
static UChar* copySpec(const UChar* spec) {
    // Nothing to copy; do not scan a null pointer for the terminator.
    if (spec == NULL) {
        return NULL;
    }
    int32_t len = 0;
    while (spec[len] != END) {
        ++len;
    }
    ++len; // the copy keeps the END terminator
    const size_t byteCount = (size_t)len * sizeof(UChar);
    UChar *result = (UChar *)uprv_malloc(byteCount);
    // Check for memory allocation error.
    if (result != NULL) {
        uprv_memcpy(result, spec, byteCount);
    }
    return result;
}
/**
 * Factory methods, one per registered variant. Each returns a newly
 * allocated UnescapeTransliterator built from the given ID and the spec
 * table for that variant. The context token is ignored (its parameter
 * name is commented out).
 */
// "U+XXXX" forms only (SPEC_Unicode).
static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
return new UnescapeTransliterator(ID, SPEC_Unicode);
}
// Java forms only (SPEC_Java).
static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
return new UnescapeTransliterator(ID, SPEC_Java);
}
// C forms only (SPEC_C).
static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
return new UnescapeTransliterator(ID, SPEC_C);
}
// XML hexadecimal forms only (SPEC_XML).
static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
return new UnescapeTransliterator(ID, SPEC_XML);
}
// XML decimal forms only (SPEC_XML10).
static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
return new UnescapeTransliterator(ID, SPEC_XML10);
}
// Perl forms only (SPEC_Perl).
static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
return new UnescapeTransliterator(ID, SPEC_Perl);
}
// Every form listed in SPEC_Any.
static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
return new UnescapeTransliterator(ID, SPEC_Any);
}
/**
* Registers standard variants with the system. Called by
* Transliterator during initialization.
*/
void UnescapeTransliterator::registerIDs() {
Token t = integerToken(0);
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
}
/**
 * Constructor. Takes the encoded spec array and stores a private copy of it
 * (made by copySpec), so the caller's array is not retained.
 *
 * @param newID   the ID forwarded to the Transliterator base class
 * @param newSpec END-terminated spec array to copy
 */
UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
                                               const UChar *newSpec) :
    Transliterator(newID, NULL),
    spec(copySpec(newSpec))
{
}
/**
 * Copy constructor. The new object gets its own copy of the other object's
 * spec array (made by copySpec) rather than sharing the pointer.
 */
UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
    Transliterator(o),
    spec(copySpec(o.spec))
{
}
/**
 * Destructor. Releases the spec array that the constructors obtained from
 * copySpec.
 */
UnescapeTransliterator::~UnescapeTransliterator() {
uprv_free(spec);
}
/**
 * Transliterator API.
 * Returns a newly allocated copy of this object, produced with the copy
 * constructor.
 */
Transliterator* UnescapeTransliterator::clone() const {
return new UnescapeTransliterator(*this);
}
/**
 * Implements {@link Transliterator#handleTransliterate}.
 *
 * Walks the text from pos.start to pos.limit. At each position the forms in
 * spec[] are tried in order. A form is a five-value header (prefix length,
 * suffix length, radix, minimum digits, maximum digits) followed by the
 * prefix and suffix code units. When prefix, digits and suffix all match,
 * the matched range is replaced by a string built from the parsed value and
 * the limit is adjusted by the change in length.
 *
 * @param text          the text being modified in place
 * @param pos           on entry, start and limit bound the range to process;
 *                      on exit, start, limit and contextLimit are updated
 * @param isIncremental if true, processing stops at a partial match that
 *                      runs into the limit, leaving pos.start at the
 *                      beginning of that partial match
 */
void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
                                                 UBool isIncremental) const {
    int32_t start = pos.start;
    int32_t limit = pos.limit;
    int32_t i, ipat;

    if (spec == NULL) {
        // The spec copy could not be allocated, so there is no form to
        // match against. Mark the range as processed without changing the
        // text instead of reading through a null table.
        pos.start = limit;
        return;
    }

    while (start < limit) {
        // Loop over the forms in spec[].  Exit this loop when we
        // match one of the specs.  Exit the outer loop if a
        // partial match is detected and isIncremental is true.
        for (ipat=0; spec[ipat] != END; ) {

            // Read the header
            int32_t prefixLen = spec[ipat++];
            int32_t suffixLen = spec[ipat++];
            int8_t  radix     = (int8_t) spec[ipat++];
            int32_t minDigits = spec[ipat++];
            int32_t maxDigits = spec[ipat++];

            // s is a copy of start that is advanced over the
            // characters as we parse them.
            int32_t s = start;
            UBool match = TRUE;

            for (i=0; i<prefixLen; ++i) {
                if (s >= limit) {
                    if (i > 0) {
                        // We've already matched a character.  This is
                        // a partial match, so we return if in
                        // incremental mode.  In non-incremental mode,
                        // go to the next spec.
                        if (isIncremental) {
                            goto exit;
                        }
                        match = FALSE;
                        break;
                    }
                }
                UChar c = text.charAt(s++);
                if (c != spec[ipat + i]) {
                    match = FALSE;
                    break;
                }
            }

            if (match) {
                // Accumulate in an unsigned type: an 8-digit hexadecimal
                // form can reach 0xFFFFFFFF, which would overflow a signed
                // 32-bit accumulator.
                uint32_t u = 0;
                int32_t digitCount = 0;
                for (;;) {
                    if (s >= limit) {
                        // Check for partial match in incremental mode.
                        if (s > start && isIncremental) {
                            goto exit;
                        }
                        break;
                    }
                    UChar32 ch = text.char32At(s);
                    int32_t digit = u_digit(ch, radix);
                    if (digit < 0) {
                        break;
                    }
                    s += U16_LENGTH(ch);
                    u = (u * (uint32_t)radix) + (uint32_t)digit;
                    if (++digitCount == maxDigits) {
                        break;
                    }
                }

                match = (digitCount >= minDigits);

                if (match) {
                    for (i=0; i<suffixLen; ++i) {
                        if (s >= limit) {
                            // Check for partial match in incremental mode.
                            if (s > start && isIncremental) {
                                goto exit;
                            }
                            match = FALSE;
                            break;
                        }
                        UChar c = text.charAt(s++);
                        if (c != spec[ipat + prefixLen + i]) {
                            match = FALSE;
                            break;
                        }
                    }

                    if (match) {
                        // At this point, we have a match.  The cast selects
                        // the code point constructor; values that are not
                        // valid code points are handed to it unchanged.
                        UnicodeString str((UChar32)u);
                        text.handleReplaceBetween(start, s, str);
                        limit -= s - start - str.length();
                        // The following break statement leaves the
                        // loop that is traversing the forms in
                        // spec[].  We then parse the next input
                        // character.
                        break;
                    }
                }
            }

            // Skip this form's prefix and suffix to reach the next header.
            ipat += prefixLen + suffixLen;
        }

        // Step over one code point: either the replacement just written or
        // a character at which no form matched.
        if (start < limit) {
            start += U16_LENGTH(text.char32At(start));
        }
    }

  exit:
    pos.contextLimit += limit - pos.limit;
    pos.limit = limit;
    pos.start = start;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_TRANSLITERATION */
//eof