scuffed-code/tools/unicode/c/genregexcasing/genregexcasing.cpp
2014-05-02 22:02:59 +00:00

141 lines
4.0 KiB
C++

/*
*******************************************************************************
*
* Copyright (C) 2014, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
*/
// file name: genregexcasing.cpp
//
// Program to generate the casing data for use by ICU regular expressions.
// The data declarations output when running this program are to be copied
// into the file i18n/regexcmp.cpp
//
// See the function RegexCompile::findCaseInsensitiveStarters() for more explanation.
#include "unicode/uniset.h"
#include "unicode/usetiter.h"
#include "iostream"
#include <map>
#include <set>
#include <string>
#include <vector>
using namespace std;
// Convert an ICU UnicodeString to a std::string holding its UTF-8 form,
// so that it can be written to a narrow output stream.
std::string sstring(const UnicodeString &us) {
    std::string utf8Text;
    us.toUTF8String(utf8Text);   // appends the UTF-8 conversion to utf8Text
    return utf8Text;
}
// Generate the case-insensitive "starter" data used by ICU regular expressions.
//
// For every code point, the case-insensitive closure of that single code point
// is computed. Each string that appears in a closure is reported on stdout, and
// the originating code point is recorded under the first code point of that
// string. The collected map is then written to stdout twice: once in a
// human-readable form, and once as four C array declarations that are to be
// pasted into i18n/regexcmp.cpp.
//
// Returns 0 on success, 1 if the generated string data no longer fits the
// int16_t offsets used by the emitted tables.
int main() {
    // Key:   first code point of a string found in some case-insensitive closure.
    // Value: every code point whose closure contained such a string.
    std::map<UChar32, std::set<UChar32>> cmap;

    // Every number this program prints is hexadecimal. Select the base once, up
    // front, so that the generated tables do not depend on a manipulator that
    // happens to have been applied while printing the diagnostics.
    std::cout << std::hex;

    for (UChar32 cp = 0; cp<=0x10ffff; cp++) {
        UnicodeSet s(cp, cp);
        s.closeOver(USET_CASE_INSENSITIVE);

        UnicodeSetIterator setIter(s);
        while (setIter.next()) {
            if (!setIter.isString()) {
                // Only the strings of the closure are of interest here.
                continue;
            }
            const UnicodeString &str = setIter.getString();

            std::cout << "Got a string for \"" << sstring(UnicodeString(cp)) << "\" [\\u" << cp << "]\n";
            std::cout << " \"" << sstring(str) << "\" [";
            for (int32_t j=0; j<str.length(); j=str.moveIndex32(j, 1)) {
                std::cout << "\\u" << str.char32At(j) << " ";
            }
            std::cout << "]" << '\n';

            const UChar32 c32 = str.char32At(0);
            if (s.contains(c32)) {
                std::cout << " Set contains first char.\n";
            }
            cmap[c32].insert(cp);
        }
    }

    std::cout << "Iterating the map.\n";
    for (const auto &mapPair: cmap) {
        const UChar32 cp = mapPair.first;
        std::cout << "key: \"" << sstring(UnicodeString(cp)) << "\" \\u" << cp << " : [";
        for (UChar32 valCP: mapPair.second) {
            std::cout << "\"" << sstring(UnicodeString(valCP)) << "\" \\u" << valCP << " ";
        }
        std::cout << "]\n";
    }

    //
    // Create the data arrays to be pasted into regexcmp.cpp
    //
    std::cout << "\n\nCopy the lines below into the file i18n/regexcmp.cpp.\n\n";
    std::cout << "// Machine Generated Data. Do not hand edit.\n";

    // All value code points, concatenated as UTF-16 in map order.
    UnicodeString outString;

    struct Item {
        UChar32 fCP = 0;        // Map key.
        int16_t fStrIndex = 0;  // UTF-16 offset into outString of this key's first value.
        int16_t fCount = 0;     // Number of code points (not code units) stored for this key.
    };

    std::vector<Item> data;
    for (const auto &mapPair: cmap) {
        const int32_t offset = outString.length();
        if (offset > 0x7fff) {
            // The emitted offset table is int16_t; refuse to write wrapped offsets.
            std::cerr << "Error: string data offset " << offset
                      << " does not fit in an int16_t table entry.\n";
            return 1;
        }

        Item dataForCP;
        dataForCP.fCP = mapPair.first;
        dataForCP.fStrIndex = static_cast<int16_t>(offset);
        for (UChar32 valCP: mapPair.second) {
            outString.append(valCP);
            dataForCP.fCount++;
        }
        data.push_back(dataForCP);
    }

    // Begin a new output row before every tenth array element.
    const auto beginElement = [](int index) {
        if (index % 10 == 0) {
            std::cout << "\n ";
        }
    };

    std::cout << " static const UChar32 RECaseFixCodePoints[] = {" ;
    int items = 0;
    for (const Item &d: data) {
        beginElement(items++);
        std::cout << "0x" << d.fCP << ", ";
    }
    // The code point table ends with 0x110000; the other tables end with 0.
    std::cout << "0x110000};\n\n";

    std::cout << " static const int16_t RECaseFixStringOffsets[] = {";
    items = 0;
    for (const Item &d: data) {
        beginElement(items++);
        std::cout << "0x" << d.fStrIndex << ", ";
    }
    std::cout << "0};\n\n";

    std::cout << " static const int16_t RECaseFixCounts[] = {";
    items = 0;
    for (const Item &d: data) {
        beginElement(items++);
        std::cout << "0x" << d.fCount << ", ";
    }
    std::cout << "0};\n\n";

    std::cout << " static const UChar RECaseFixData[] = {";
    for (int32_t i=0; i<outString.length(); i++) {
        beginElement(i);
        // Print the code unit as a number. Streaming a UChar directly depends on
        // which type UChar is a typedef of, and character types are not
        // reliably formatted as integers by operator<<.
        std::cout << "0x" << static_cast<unsigned int>(outString.charAt(i)) << ", ";
    }
    std::cout << "0};\n\n";

    return 0;
}