1870215131
X-SVN-Rev: 40527
2070 lines
64 KiB
C++
2070 lines
64 KiB
C++
// © 2017 and later: Unicode, Inc. and others.
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
#include "sortedlines.h"
|
|
|
|
static int codePointCmp(const void *a, const void *b) {
|
|
return u_strcmp((*(Line **)a)->name, (*(Line **)b)->name);
|
|
}
|
|
|
|
/**
 * Builds the line table for the given repertoire.
 *
 * @param set            characters/strings to be turned into Lines
 * @param excludeBounds  set consulted by getBounds() when picking upper bounds
 * @param probe          strength probe used for all comparisons
 * @param logger         printer for progress output
 * @param debug          printer for debug output
 *
 * Every upper-bound slot starts out pointing at the `empty` line, then init()
 * allocates and fills `lines` from the repertoire.
 */
SortedLines::SortedLines(const UnicodeSet &set, const UnicodeSet &excludeBounds, const StrengthProbe &probe,
                         UPrinter *logger, UPrinter *debug) :
toSort(NULL),
toSortCapacity(0),
lines(NULL),
size(0),
capacity(0),
repertoire(set),
excludeBounds(excludeBounds),
probe(probe),
first(NULL),
last(NULL),
logger(logger),
debug(debug),
contractionsTable(NULL),
duplicators(NULL),
maxExpansionPrefixSize(0),
wordSort(FALSE),
frenchSecondary(FALSE),
upperFirst(FALSE),
sortkeys(NULL),
sortkeyOffset(0)
{
  // Zero the whole array first so that any slot not covered by the loop
  // below is at least NULL.
  memset(UB, 0, sizeof(UB));
  for(int32_t i = 0; i < UCOL_OFF; i++) {
    // The scraped source had the HTML entity "&empty;" decoded into a
    // single symbol here; the statement takes the address of the empty line.
    UB[i] = &empty;
  }
  init();
}
|
|
|
|
/**
 * Releases the line table, the sort key buffer, the sorting array and the two
 * hashtables. delete / delete[] on a null pointer is a no-op, so the members
 * are released unconditionally.
 */
SortedLines::~SortedLines()
{
  delete[] lines;
  delete[] sortkeys;
  delete[] toSort;
  delete contractionsTable;
  delete duplicators;
}
|
|
|
|
void
|
|
SortedLines::getBounds(UErrorCode &status) {
|
|
// first sort through the set
|
|
debug->log(toString(), TRUE);
|
|
int32_t i = 0, j = 0;
|
|
UColAttributeValue strength = UCOL_OFF;
|
|
for(i = 0; i < size; i++) {
|
|
if(toSort[i]->strengthFromEmpty < strength) {
|
|
if(i && strength < UCOL_OFF) {
|
|
//u_strcpy(UB[strength], toSort[i-1]->name);
|
|
j = 1;
|
|
while(excludeBounds.contains(UnicodeString(toSort[i-j]->name, toSort[i-j]->len))) {
|
|
j++;
|
|
}
|
|
UB[strength] = toSort[i-j];
|
|
}
|
|
strength = toSort[i]->strengthFromEmpty;
|
|
if(strength == UCOL_PRIMARY) {
|
|
probe.SE = toSort[i]->name[0];
|
|
}
|
|
}
|
|
}
|
|
//u_strcpy(UB[strength], toSort[size-1]->name);
|
|
// a different solution for bounds: go from end and see if the guys on the top
|
|
// cause duplication for things
|
|
UChar dupch[] = { 0x0020, 0x0030, 0x0042, 0x0051, 0x0062, 0x0071, 0x0391, 0x0396, 0x03b1, 0x03b6 };
|
|
j = 1;
|
|
Line dup;
|
|
Line bound;
|
|
int32_t dups = 0;
|
|
while(j < size) {
|
|
dups = 0;
|
|
for(i = 0; i < sizeof(dupch)/sizeof(dupch[0]); i++) {
|
|
dup.setTo(dupch[i]);
|
|
dup.append(dupch[i]);
|
|
bound.setTo(dupch[i]);
|
|
bound.append(toSort[size-j]->name, toSort[size-j]->len);
|
|
if(probe.getStrength(dup, bound) >= UCOL_IDENTICAL) {
|
|
dups++;
|
|
}
|
|
}
|
|
if(dups == 0) {
|
|
break;
|
|
} else {
|
|
if(!duplicators) {
|
|
duplicators = new Hashtable();
|
|
}
|
|
duplicators->put(UnicodeString(toSort[size-j]->name, toSort[size-j]->len), &toSort[size-j], status);
|
|
debug->log(toSort[size-j]->toString());
|
|
debug->log(" is not good enough to be an upper bound\n");
|
|
j++;
|
|
}
|
|
}
|
|
if(j == size) {
|
|
debug->log("Oi! I'm hallucinating. Will use the first upper bound");
|
|
delete duplicators;
|
|
duplicators = NULL;
|
|
j = 1;
|
|
}
|
|
/*
|
|
j = 1;
|
|
while(excludeBounds.contains(UnicodeString(toSort[size-j]->name, toSort[size-j]->len))) {
|
|
j++;
|
|
}
|
|
*/
|
|
UB[strength] = toSort[size-j];
|
|
for(i = 0; i < UCOL_OFF; i++) {
|
|
if(UB[i]) {
|
|
//debug->log(UB[i], TRUE);
|
|
debug->log(UB[i]->toString(TRUE), TRUE);
|
|
}
|
|
}
|
|
}
|
|
|
|
// classifies repertoire according to the strength of their difference
|
|
// from the empty string
|
|
void
|
|
SortedLines::classifyRepertoire() {
|
|
UColAttributeValue strongestStrengthFromEmpty = UCOL_OFF;
|
|
int32_t lastChange = 0;
|
|
int32_t i = 0, j = 0;
|
|
while(i < size) // && probe.distanceFromEmptyString(*toSort[i]) > UCOL_PRIMARY)
|
|
{
|
|
toSort[i]->strengthFromEmpty = probe.distanceFromEmptyString(*toSort[i]);
|
|
if(toSort[i]->strengthFromEmpty < strongestStrengthFromEmpty) {
|
|
strongestStrengthFromEmpty = toSort[i]->strengthFromEmpty;
|
|
lastChange = i;
|
|
} else if (toSort[i]->strengthFromEmpty > strongestStrengthFromEmpty) {
|
|
// there is a problem in detection. Most probably a quaternary.
|
|
// why don't we try to interpolate
|
|
UColAttributeValue nextStrength = UCOL_OFF;
|
|
UColAttributeValue prevStrength = UCOL_OFF;
|
|
UColAttributeValue st = UCOL_OFF;
|
|
|
|
logger->log("Interpolating to get the distance from empty for Line ");
|
|
logger->log(toSort[i]->toString(TRUE), TRUE);
|
|
|
|
if(i) {
|
|
st = probe.getStrength(*toSort[i-1], *toSort[i]);
|
|
if(st == UCOL_OFF) {
|
|
logger->log("Cannot deduce distance from empty using previous element. Something is very wrong! Line:");
|
|
logger->log(toSort[i]->toString(TRUE), TRUE);
|
|
} else if(st == UCOL_IDENTICAL || st >= toSort[i-1]->strengthFromEmpty) {
|
|
prevStrength = toSort[i-1]->strengthFromEmpty;
|
|
} else if(st < toSort[i-1]->strengthFromEmpty) {
|
|
prevStrength = st;
|
|
}
|
|
toSort[i]->strengthFromEmpty = prevStrength;
|
|
}
|
|
if(i < size-2) {
|
|
toSort[i+1]->strengthFromEmpty = probe.distanceFromEmptyString(*toSort[i+1]);
|
|
st = probe.getStrength(*toSort[i+1], *toSort[i]);
|
|
if(st == UCOL_OFF) {
|
|
logger->log("Cannot deduce distance from empty using next element. Something is very wrong! Line:");
|
|
logger->log(toSort[i]->toString(TRUE), TRUE);
|
|
} else if(st == UCOL_IDENTICAL || st < toSort[i+1]->strengthFromEmpty) {
|
|
nextStrength = toSort[i+1]->strengthFromEmpty;
|
|
} else if(st >= toSort[i+1]->strengthFromEmpty) {
|
|
nextStrength = st;
|
|
}
|
|
if(i) {
|
|
if(prevStrength != nextStrength) {
|
|
logger->log("Inconsistent results from interpolation! Results will most likely be wrong\n");
|
|
}
|
|
}
|
|
toSort[i]->strengthFromEmpty = nextStrength;
|
|
}
|
|
/*
|
|
UColAttributeValue problemStrength = UCOL_PRIMARY;
|
|
for(j = lastChange; j < i ; j++) {
|
|
if(toSort[j]->strength > problemStrength) {
|
|
problemStrength = toSort[j]->strength;
|
|
}
|
|
}
|
|
for(j = lastChange; j < i ; j++) {
|
|
toSort[j]->strengthFromEmpty = problemStrength;
|
|
}
|
|
strongestStrengthFromEmpty = toSort[i]->strengthFromEmpty;
|
|
lastChange = i;
|
|
debug->log("Problem detected in distances from empty. Most probably word sort is on\n");
|
|
*/
|
|
wordSort = TRUE;
|
|
}
|
|
i++;
|
|
}
|
|
debug->log("Distances from empty string\n");
|
|
debug->log(toStringFromEmpty(), TRUE);
|
|
}
|
|
|
|
void
|
|
SortedLines::analyse(UErrorCode &status) {
|
|
frenchSecondary = probe.isFrenchSecondary(status);
|
|
if(U_FAILURE(status)) {
|
|
logger->log("Test for French secondary failed. Bailing out!\n");
|
|
return;
|
|
}
|
|
logger->log("French secondary value is %i\n", frenchSecondary, frenchSecondary);
|
|
upperFirst = probe.isUpperFirst(status);
|
|
if(U_FAILURE(status)) {
|
|
logger->log("Test for upper first failed. Bailing out!\n");
|
|
return;
|
|
}
|
|
logger->log("upper first value is %i\n", upperFirst, upperFirst);
|
|
sort(TRUE, TRUE);
|
|
classifyRepertoire();
|
|
getBounds(status);
|
|
//sort(TRUE, TRUE);
|
|
addContractionsToRepertoire(status);
|
|
//sort(TRUE, TRUE);
|
|
debug->log("\n*** Order after detecting contractions\n\n");
|
|
calculateSortKeys();
|
|
debug->log(toPrettyString(FALSE, TRUE), TRUE);
|
|
detectExpansions();
|
|
}
|
|
|
|
void SortedLines::init()
|
|
{
|
|
size = repertoire.size();
|
|
capacity = 5*size;
|
|
lines = new Line[capacity];
|
|
init(repertoire, lines);
|
|
}
|
|
|
|
/**
 * Writes one Line per element of `rep` into consecutive slots of `lin`.
 *
 * @param rep  set to iterate; strings and single code points are both handled
 * @param lin  destination array; the caller must provide one slot per element
 */
void SortedLines::init(UnicodeSet &rep, Line *lin)
{
  UnicodeSetIterator iter(rep);
  for(Line *slot = lin; iter.next(); slot++) {
    if(iter.isString()) {
      // process a string
      slot->setTo(iter.getString());
    } else {
      // process code point
      slot->setTo(iter.getCodepoint());
    }
    // zero terminate, for our evil ways
    slot->name[slot->len] = 0;
  }
}
|
|
|
|
void
|
|
SortedLines::setSortingArray(Line **sortingArray, Line *elements, int32_t sizeToSort) {
|
|
int32_t i = 0;
|
|
for(i = 0; i < sizeToSort; i++) {
|
|
sortingArray[i] = &elements[i];
|
|
}
|
|
}
|
|
|
|
/**
 * Copies every value stored in `table` (interpreted as Line*) into
 * sortingArray, in the table's iteration order.
 *
 * @return table->count(), as read before iterating
 */
int32_t
SortedLines::setSortingArray(Line **sortingArray, Hashtable *table) {
  const int32_t tableCount = table->count();
  int32_t pos = -1;
  int32_t filled = 0;
  for(const UHashElement *entry = table->nextElement(pos);
      entry != NULL;
      entry = table->nextElement(pos)) {
    sortingArray[filled] = (Line *)entry->value.pointer;
    filled++;
  }
  return tableCount;
}
|
|
|
|
void
|
|
SortedLines::sort(Line **sortingArray, int32_t sizeToSort, UBool setStrengths, UBool link) {
|
|
int32_t i = 0;
|
|
int32_t equalStart = 0;
|
|
UColAttributeValue equalStrength = UCOL_OFF;
|
|
|
|
qsort(sortingArray, sizeToSort, sizeof(Line *), probe.comparer);
|
|
|
|
if(setStrengths) { // analyze strengths
|
|
for(i = 1; i < sizeToSort; i++) {
|
|
sortingArray[i]->strength = probe.getStrength(*sortingArray[i-1], *sortingArray[i]);
|
|
}
|
|
// for equal guys, do the code point ordering
|
|
|
|
i = 1;
|
|
while(i < sizeToSort)
|
|
{
|
|
if(sortingArray[i]->strength == UCOL_IDENTICAL) {
|
|
equalStart = i - 1;
|
|
equalStrength = sortingArray[equalStart]->strength;
|
|
sortingArray[equalStart]->strength = UCOL_IDENTICAL;
|
|
while(i < sizeToSort && sortingArray[i]->strength == UCOL_IDENTICAL) {
|
|
i++;
|
|
}
|
|
qsort(sortingArray+equalStart, i-equalStart, sizeof(Line *), codePointCmp);
|
|
sortingArray[equalStart]->strength = equalStrength;
|
|
} else {
|
|
i++;
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if(link) { // do the linking
|
|
for(i = 0; i < sizeToSort - 1; i++) {
|
|
Line *curr = *(sortingArray+i);
|
|
curr->next = *(sortingArray+i+1);
|
|
(*(sortingArray+i+1))->previous = curr;
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
SortedLines::sort(UBool setStrengths, UBool link) {
|
|
if(toSortCapacity < size || !toSort) {
|
|
if(toSort) {
|
|
delete[] toSort;
|
|
}
|
|
toSort = new Line*[size*2];
|
|
toSortCapacity = size*2;
|
|
}
|
|
|
|
setSortingArray(toSort, lines, size);
|
|
sort(toSort, size, setStrengths, link);
|
|
|
|
first = last = NULL;
|
|
|
|
if(link) { // do the linking
|
|
first = *toSort;
|
|
last = *(toSort+size-1);
|
|
}
|
|
}
|
|
|
|
void
|
|
SortedLines::updateBounds(UnicodeSet &set) {
|
|
Line line;
|
|
UnicodeString s1;
|
|
UnicodeSetIterator it1(set);
|
|
while(it1.next()) {
|
|
if(!debug->isOn()) {
|
|
logger->log(".");
|
|
}
|
|
if(it1.isString()) { // process a string
|
|
s1.setTo(it1.getString());
|
|
} else { // process code point
|
|
s1.setTo(it1.getCodepoint());
|
|
}
|
|
//line.setTo(s1);
|
|
UColAttributeValue strength = probe.distanceFromEmptyString(s1);
|
|
if(probe.compare(UnicodeString(UB[strength]->name), s1) < 0) {
|
|
// TODO: leak here - fixit!
|
|
UB[strength] = new Line(s1);
|
|
//u_strcpy(UB[strength], s1.getTerminatedBuffer());
|
|
}
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
/**
 * Appends copies of toAdd[0..toAddSize) to the line table and grows `size`
 * accordingly.
 *
 * The table is not reallocated: setSortingArray() stores raw addresses of
 * `lines` entries in toSort, so moving the storage here would invalidate
 * them. When the request does not fit into `capacity`, only the lines that
 * fit are added and a message is logged, instead of writing past the end of
 * the array.
 *
 * @param toAdd      source lines
 * @param toAddSize  number of source lines
 */
void SortedLines::addAll(Line* toAdd, int32_t toAddSize)
{
  const int32_t room = capacity - size;
  if(toAddSize > room) {
    // we need to do growing here
    logger->log("Line table is full. Not all lines could be added!\n");
    toAddSize = room > 0 ? room : 0;
  }

  for(int32_t i = 0; i < toAddSize; i++) {
    lines[size+i] = toAdd[i];
  }
  size += toAddSize;
}
|
|
|
|
/**
 * Stores, for each of the first arraySize lines of `array`, the probe's
 * distance from the empty string in the line's strengthFromEmpty field.
 */
void SortedLines::setDistancesFromEmpty(Line* array, int32_t arraySize)
{
  Line *const end = array + arraySize;
  for(Line *cursor = array; cursor < end; cursor++) {
    cursor->strengthFromEmpty = probe.distanceFromEmptyString(*cursor);
  }
}
|
|
|
|
|
|
// adds contractions in to repertoire
|
|
int32_t SortedLines::addContractionsToRepertoire(UErrorCode &status)
|
|
{
|
|
logger->log("\n*** Detecting contractions\n\n");
|
|
contractionsTable = new Hashtable();
|
|
int32_t noConts = 0;
|
|
int32_t allocateSize = 50*size;
|
|
// first check for simple contractions
|
|
Line* delta = new Line[allocateSize];
|
|
Line** deltaSorted = new Line*[allocateSize];
|
|
Line* lesserToAddTo = new Line[allocateSize];
|
|
Line* newDelta = new Line[allocateSize];
|
|
Line** newDeltaSorted = new Line*[allocateSize];
|
|
Line* deltaP = delta;
|
|
Line** deltaPP = deltaSorted;
|
|
Line* newDeltaP = newDelta;
|
|
int32_t deltaSize = 0, lesserToAddToSize = 0, newDeltaSize = 0;
|
|
logger->log("++ Contraction detection generation 0\n");
|
|
noConts = detectContractions(toSort, size, toSort, size,
|
|
delta, deltaSize, lesserToAddTo, lesserToAddToSize, 3*size, status);
|
|
setSortingArray(deltaSorted, delta, deltaSize);
|
|
sort(deltaSorted, deltaSize, TRUE);
|
|
|
|
setDistancesFromEmpty(delta, deltaSize);
|
|
int32_t deltaPSize = deltaSize;
|
|
//updateBounds(delta);
|
|
|
|
int32_t generation = 0;
|
|
// if we found any, we have to try multiple contractions
|
|
// However, we want to prevent the contractions explosion
|
|
// if the number of simple contractions is greater than the
|
|
// starting size, chances are that we either have an algorithmic
|
|
// contraction (like iteration marks on w2k) or something
|
|
// is seriosly wrong.
|
|
if(deltaPSize < size/2) {
|
|
while (deltaPSize && generation < 1) {
|
|
generation++;
|
|
logger->log("\n++ Contraction detection generation %i\n", generation, generation);
|
|
// find more, but avoid testing the combinations we already have
|
|
noConts += detectContractions(toSort, size, deltaPP, deltaPSize,
|
|
newDeltaP, newDeltaSize, lesserToAddTo, lesserToAddToSize, 3*size, status);
|
|
noConts += detectContractions(deltaPP, deltaPSize, toSort, size,
|
|
newDeltaP, newDeltaSize, lesserToAddTo, lesserToAddToSize, 3*size, status);
|
|
calculateSortKeys();
|
|
|
|
addAll(deltaP, deltaPSize);
|
|
setSortingArray(toSort, lines, size);
|
|
sort(TRUE, TRUE);
|
|
setSortingArray(newDeltaSorted, newDeltaP, newDeltaSize);
|
|
sort(newDeltaSorted, newDeltaSize, TRUE);
|
|
|
|
// if no new ones, bail
|
|
//if (newDeltaSize == 0) break;
|
|
|
|
deltaPSize = newDeltaSize;
|
|
newDeltaSize = 0;
|
|
if(deltaP == delta) {
|
|
deltaP = newDelta;
|
|
deltaPP = newDeltaSorted;
|
|
newDeltaP = delta;
|
|
} else {
|
|
deltaP = delta;
|
|
deltaPP = deltaSorted;
|
|
newDeltaP = newDelta;
|
|
}
|
|
setDistancesFromEmpty(deltaP, deltaPSize);
|
|
}
|
|
}
|
|
status = U_ZERO_ERROR;
|
|
// add stuff from the last batch
|
|
addAll(deltaP, deltaPSize);
|
|
|
|
// warning: we don't add the lesser ones in recursively, since they will
|
|
// infinitely loop
|
|
setDistancesFromEmpty(lesserToAddTo, lesserToAddToSize);
|
|
addAll(lesserToAddTo, lesserToAddToSize);
|
|
setSortingArray(toSort, lines, size);
|
|
sort(TRUE, TRUE);
|
|
|
|
delete[] deltaSorted;
|
|
delete[] delta;
|
|
delete[] lesserToAddTo;
|
|
delete[] newDeltaSorted;
|
|
delete[] newDelta;
|
|
return noConts;
|
|
}
|
|
|
|
|
|
/**
 * Tests every concatenation firstRep[i] + secondRep[j] for contraction
 * behaviour and records the hits through noteContraction().
 *
 * A pair is skipped when either side starts with a Han character, when the
 * left side is not primary-different from empty, when the right side is a
 * known duplicator, or when normalization would reorder or swallow it.
 * Otherwise the trial string is compared with a lower bound (the left side)
 * and an upper bound (left side + UB[UCOL_PRIMARY]):
 *  - trial <= lower: recorded in lesserToAddTo ("L") under extra conditions
 *  - trial > upper:  recorded in toAddTo ("U")
 *  - further heuristics ("D1", "!1" .. "!5") compare against neighbouring
 *    combinations.
 *
 * @param firstRep, firstSize    left-hand candidates
 * @param secondRep, secondSize  right-hand candidates
 * @param toAddTo, toAddToSize   output array for contractions and its fill count
 * @param lesserToAddTo, lesserToAddToSize  output array for "lesser" contractions
 * @param capacity               detection stops once this many were counted
 * @param status                 loop stops at the next left-hand candidate
 *                               once this is a failure
 * @return number of contractions counted in this call
 */
int32_t SortedLines::detectContractions(Line **firstRep, int32_t firstSize,
                                        Line **secondRep, int32_t secondSize,
                                        Line *toAddTo, int32_t &toAddToSize,
                                        Line *lesserToAddTo, int32_t &lesserToAddToSize,
                                        int32_t capacity, UErrorCode &status)
{
  int32_t noConts = 0;
  int i = 0, j = 0, k = 0;
  Line lower, upper, trial, toAdd, helper;
  UChar32 firstStart, firstEnd, secondStart;
  UChar NFCTrial[256];
  int32_t NFCTrialLen = 0;
  UBool thai = FALSE;
  i = -1;
  while(i < firstSize-1 && U_SUCCESS(status)) {
    i++;
    if(!debug->isOn()) {
      logger->log("\rTesting %05i/%05i. Found %05i conts.", i, firstSize, noConts);
    }
    U16_GET(firstRep[i]->name, 0, 0, firstRep[i]->len, firstStart);
    if(uscript_getScript(firstStart, &status) == USCRIPT_HAN || firstRep[i]->strengthFromEmpty > UCOL_PRIMARY) //UCOL_TERTIARY)
    {
      continue;
    }
    lower = *firstRep[i];
    for(j = 0; j < secondSize; j++) {
      // One iteration may record two contractions, so the counter can step
      // over the limit; test with >= rather than ==.
      if(noConts >= capacity) {
        return noConts;
      }
      U16_GET(secondRep[j]->name, 0, 0, secondRep[j]->len, secondStart);
      if(uscript_getScript(secondStart, &status) == USCRIPT_HAN) // || secondRep[j]->strengthFromEmpty > UCOL_TERTIARY)
      {
        continue;
      }
      if(duplicators && duplicators->get(UnicodeString(secondRep[j]->name, secondRep[j]->len)) != NULL) {
        debug->log("Skipping duplicator ");
        debug->log(secondRep[j]->toString(), TRUE);
        continue;
      }

      upper.setToConcat(firstRep[i], UB[UCOL_PRIMARY]);
      //upper.setToConcat(firstRep[i], UB[secondRep[j]->strengthFromEmpty]);
      toAdd.setToConcat(firstRep[i], secondRep[j]);
      U16_GET(firstRep[i]->name, 0, firstRep[i]->len-1, firstRep[i]->len, firstEnd);
      thai = u_hasBinaryProperty(firstEnd, UCHAR_LOGICAL_ORDER_EXCEPTION);
      if(thai) {
        // this means that the lower is single reordering character
        // if we do the lower test without taking this into account,
        // we'll comparing the secondRep directly to Thai. We add UB[UCOL_PRIMARY] to
        // end of lower and in the middle of trial, so we will have
        // lower = Thai + UB, trial Thai + UB + x, resolving to
        // UB + Thai vs UB + Thai + x.
        // for upper bound, we do the similar, so we have
        // upper = Thai + UB + UB, trial = Thai + UB + x,
        // resolving to UB + Thai + UB vs UB + Thai + x
        if(secondRep[j]->firstCC) {
          UChar32 UBChar;
          U16_GET(UB[UCOL_SECONDARY]->name, 0, 0, UB[UCOL_SECONDARY]->len, UBChar);
          if(secondRep[j]->firstCC > u_getCombiningClass(UBChar)) {
            continue;
          }
        }
        upper = *firstRep[i];
        upper.append(*UB[UCOL_PRIMARY]);
        //upper.append(*UB[secondRep[j]->strengthFromEmpty]);
        upper.append(*UB[UCOL_PRIMARY]);
        // Rebuild lower from scratch: several `continue`s below skip the
        // restore at the end of the loop body, and appending to a lower
        // that already carries UB would make it grow with every j.
        lower = *firstRep[i];
        lower.append(*UB[UCOL_PRIMARY]);
        trial = *firstRep[i];
        trial.append(*UB[UCOL_PRIMARY]);
        trial.append(*secondRep[j]);
      } else if((firstRep[i]->lastCC > secondRep[j]->firstCC && secondRep[j]->firstCC && !frenchSecondary)
        || (firstRep[i]->firstCC < secondRep[j]->lastCC && firstRep[i]->firstCC && frenchSecondary)) {
        // Skip because normalization will reorder
        // there will be a chance to check this again, since if we
        // try a+b, we will also try b+a
        continue;
      } else if(frenchSecondary && (firstRep[i]->strengthFromEmpty > UCOL_PRIMARY && secondRep[j]->strengthFromEmpty > UCOL_PRIMARY)) {
        continue;
      } else if(firstRep[i]->lastCC && secondRep[j]->firstCC && frenchSecondary) {
        trial.setToConcat(secondRep[j], firstRep[i]);
      } else {
        trial.setToConcat(firstRep[i], secondRep[j]);
      }
      // Now let's check the trial. The problem is that when you combine characters,
      // you can end up with concatenation that is unknown for the examined API.
      NFCTrialLen = unorm_normalize(trial.name, trial.len, UNORM_NFC, 0, NFCTrial, 256, &status);
      if((u_strcmp(trial.name, NFCTrial) == 0) || u_strFindLast(NFCTrial, NFCTrialLen, secondRep[j]->name, secondRep[j]->len)) {
        if(secondRep[j]->strengthFromEmpty > UCOL_TERTIARY) {
          continue;
        }
      }
      // Skip when the first or the last character of the normalized trial
      // is more than tertiary-ignorable on its own.
      UChar32 c;
      U16_GET(NFCTrial, 0, 0, NFCTrialLen, c);
      helper.setTo(c);
      if(probe.distanceFromEmptyString(helper) > UCOL_TERTIARY) {
        continue;
      }
      if(NFCTrialLen > 1) {
        U16_GET(NFCTrial, 0, NFCTrialLen-1, NFCTrialLen, c);
        helper.setTo(c);
        if(probe.distanceFromEmptyString(helper) > UCOL_TERTIARY) {
          continue;
        }
      }

      if (probe.compare(lower, trial) >= 0) { // if lower is bigger than trial
        // this might be ok, but I'm having doubts. Here is an additional check:
        if(firstRep[i]->len == 1 || secondRep[j]->strengthFromEmpty == UCOL_PRIMARY) {
          // I'm basically saying that I'll add this kind of contraction for cases where I combine
          // one letter with an accent OR when I'm combining more than one symbol with a letter.
          noteContraction("L", lesserToAddTo, lesserToAddToSize, firstRep[i], secondRep[j], noConts, status);
        }
      }
      else if (probe.compare(trial, upper) > 0) { // trial is bigger than upper??
        noteContraction("U", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
      }
#if 0
      else if(firstRep[i]->strengthFromEmpty == UCOL_PRIMARY)
      {
        Line expansionLine;
        if(getExpansionLine(trial, *firstRep[i], *secondRep[j], expansionLine) &&
          expansionLine.len && !(expansionLine == *secondRep[j])) {
          noteContraction("D", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
        }
      }
#endif
      else if(firstRep[i]->strengthFromEmpty == UCOL_PRIMARY && probe.getStrength(lower, trial) < secondRep[j]->strengthFromEmpty) {
        noteContraction("D1", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
      }
      else if (firstRep[i]->strengthFromEmpty == UCOL_PRIMARY && secondRep[j]->strengthFromEmpty == UCOL_PRIMARY)
      {
        // I have added an additional check. The checks versus upper and lower bound should be sufficient
        // when the right side is a combining mark. There might be a reordering of combining marks, but
        // that should be already visible in their order.
        // compare the sequence
        // Y- <? Y <? Y+
        // and
        // XY- <? XY <? XY+
        Line xym, xyp, xy;
        UBool xymIsContraction = FALSE, toAddIsContraction = FALSE;
        if(j) {
          if(((!secondRep[j-1]->firstCC || firstRep[i]->lastCC < secondRep[j-1]->firstCC) && !frenchSecondary)
            ||((!firstRep[i]->firstCC || firstRep[i]->firstCC > secondRep[j-1]->lastCC) && frenchSecondary)) {
            xym.setToConcat(firstRep[i], secondRep[j-1]);
            toAdd.strength = probe.getStrength(xym, toAdd);
            if(secondRep[j]->strength != toAdd.strength) {
              // there is possibility that either xym or xy are contractions
              // There are two situations:
              // xym > xy or xym <n xy and ym <k y but n != k
              // if they are reordered, we are going to see if each of them
              // is further reordered
              if(toAdd.strength == UCOL_OFF) {
                // check whether toAdd shifted more down
                k = j - 2;
                while(k>=0 && secondRep[k]->strength > secondRep[j]->strength) {
                  k--;
                }
                while(!toAddIsContraction && k>=0) {
                  xyp.setToConcat(firstRep[i], secondRep[k]);
                  if(contractionsTable->get(UnicodeString(xyp.name, xyp.len)) != NULL) {
                    k--;
                    continue;
                  }
                  if(probe.compare(xyp, xym) >= 0) {
                    // xyp looks like a contraction
                    noteContraction("!1", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
                    toAddIsContraction = TRUE;
                  } else {
                    break;
                  }
                }
                // first let's see if xym has moved beyond
                if(contractionsTable->get(UnicodeString(xym.name, xym.len)) == NULL) {
                  k = j+1;
                  // ignore weaker strengths
                  while(k < secondSize && secondRep[k]->strength > secondRep[j]->strength) {
                    k++;
                  }
                  // check if we skipped the following guy
                  if(k < secondSize) {
                    xyp.setToConcat(firstRep[i], secondRep[k]);
                    if(probe.compare(xyp, xym) <= 0) {
                      // xyp looks like a contraction
                      noteContraction("!2", toAddTo, toAddToSize, firstRep[i], secondRep[j-1], noConts, status);
                      xymIsContraction = TRUE;
                    }
                  }
                } else {
                  xymIsContraction = TRUE;
                }
                // if they have reordered, but none has moved, then we add them both
                // and hope for the best
                if(!xymIsContraction && !toAddIsContraction) {
                  // it is possible that there is an NFC version of one of the
                  // strings. If we have XY > XZ, but NFC(XZ) = W and X < W, we might
                  // have a false contraction.
                  trial.len = unorm_normalize(toAdd.name, toAdd.len, UNORM_NFC, 0, trial.name, 25, &status);
                  //UColAttributeValue strength = probe.getStrength(*firstRep[i], trial);
                  if(trial == toAdd) {
                    noteContraction("!3", toAddTo, toAddToSize, firstRep[i], secondRep[j-1], noConts, status);
                    noteContraction("!3", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
                  } else {
                    noteContraction("!4", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
                  }
                }
              } else { // only the strength has changed
                // check whether the previous is contraction and if not, add the current
                if(contractionsTable->get(UnicodeString(xym.name, xym.len)) == NULL) {
                  noteContraction("!5", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
                }
              }
            }
          }
        }
      }
      if(thai) { // restore lower
        lower = *firstRep[i];
      }
    }
  }
  return noConts;
}
|
|
|
|
void
|
|
SortedLines::noteContraction(const char* msg, Line *toAddTo, int32_t &toAddToSize, Line *left, Line *right, int32_t &noConts, UErrorCode &status)
|
|
{
|
|
Line toAdd;
|
|
toAdd.setToConcat(left, right);
|
|
toAdd.left = left;
|
|
toAdd.right = right;
|
|
// if we're adding an accent to an existing contraction, we want to check
|
|
#if 0
|
|
Line test, trial1, trial2;
|
|
if(right->strengthFromEmpty > UCOL_PRIMARY) {
|
|
if(left->right && left->right->previous && left->right->next) {
|
|
test.setToConcat(left->left, left->right->previous);
|
|
trial1.setToConcat(&test, right);
|
|
|
|
test.setToConcat(left->left, left->right->next);
|
|
trial2.setToConcat(&test, right);
|
|
if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) {
|
|
// this means that the contraction has been broken by the newly added accent
|
|
// so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
|
|
debug->log("Con -");
|
|
debug->log(msg);
|
|
debug->log(toAdd.toString(FALSE), TRUE);
|
|
return;
|
|
}
|
|
} else {
|
|
if(right->previous && right->next) {
|
|
trial1.setToConcat(left, right->previous);
|
|
trial2.setToConcat(left, right->next);
|
|
if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) {
|
|
// this means that the contraction has been broken by the newly added accent
|
|
// so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
|
|
debug->log("Con -");
|
|
debug->log(msg);
|
|
debug->log(toAdd.toString(FALSE), TRUE);
|
|
return;
|
|
}
|
|
}
|
|
if(left->previous && left->next) {
|
|
trial1.setToConcat(left->previous, right);
|
|
trial2.setToConcat(left->next, right);
|
|
if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) {
|
|
// this means that the contraction has been broken by the newly added accent
|
|
// so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
|
|
debug->log("Con -");
|
|
debug->log(msg);
|
|
debug->log(toAdd.toString(FALSE), TRUE);
|
|
return;
|
|
}
|
|
}
|
|
|
|
}
|
|
}
|
|
if(right->right && right->right->strengthFromEmpty > UCOL_PRIMARY && right->left->previous && right->left->next) { // maybe we already had a contraction with an accent
|
|
test.setToConcat(right->left->previous, right->right);
|
|
trial1.setToConcat(left, &test);
|
|
test.setToConcat(right->left->next, right->right);
|
|
trial2.setToConcat(left, &test);
|
|
if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) {
|
|
// this means that the contraction has been broken by the newly added accent
|
|
// so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
|
|
debug->log("Con -");
|
|
debug->log(msg);
|
|
debug->log(toAdd.toString(FALSE), TRUE);
|
|
return;
|
|
}
|
|
}
|
|
#endif
|
|
if(contractionsTable->get(UnicodeString(toAdd.name, toAdd.len)) == NULL) {
|
|
if(probe.distanceFromEmptyString(toAdd) <= UCOL_TERTIARY) {
|
|
toAddTo[toAddToSize++] = toAdd;
|
|
contractionsTable->put(UnicodeString(toAdd.name, toAdd.len), &toAdd, status);
|
|
noConts++;
|
|
debug->log(msg);
|
|
debug->log(" Con + ");
|
|
debug->log(toAdd.toString(FALSE), TRUE);
|
|
|
|
if(!left->sortKey) {
|
|
calculateSortKey(*left);
|
|
}
|
|
debug->log(left->dumpSortkey());
|
|
debug->log(" + ");
|
|
|
|
if(!right->sortKey) {
|
|
calculateSortKey(*right);
|
|
}
|
|
debug->log(right->dumpSortkey());
|
|
debug->log(" = ");
|
|
|
|
calculateSortKey(toAdd);
|
|
debug->log(toAdd.dumpSortkey(), TRUE);
|
|
if(noConts > size/2) {
|
|
status = U_BUFFER_OVERFLOW_ERROR;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/**
 * Tries to find the sequence that `expansion` expands to after `previous`.
 *
 * The sorted repertoire is scanned for primary-different lines; each is
 * appended to previous + expansionLine and compared against `expansion`.
 * An exact match returns that single line. When a candidate sorts above
 * `expansion`, the lines between the last two primaries are tried and the
 * best one is appended to expansionLine, unless the trial string is a known
 * contraction, in which case scanning simply continues. The scan restarts
 * until nothing more can be added or expansionLine grows beyond 15 units.
 *
 * @param expansion      the line suspected to be an expansion
 * @param previous       the line preceding it
 * @param exp            returned directly if it equals the last primary seen
 * @param expansionLine  receives the expanding sequence
 * @return TRUE on an exact match, otherwise whether anything was appended
 */
UBool
SortedLines::getExpansionLine(const Line &expansion, const Line &previous, const Line &exp, Line &expansionLine)
{
  int32_t piecesAdded = 0;
  UBool done = FALSE;
  Line candidate;

  expansionLine.clear();

  // we will start from strength between the expansion
  // and the target (toSort[i] and toSort[j]. First we
  // will add as many primaries as possible. Then we will
  // try to add secondary pieces and then tertiary.
  // found an expansion - what is the expanding sequence?
  while(!done) {
    int32_t cur = 0;
    int32_t lastPrimary = 0;
    while(cur < size) {
      if(expansionLine.len > 15) {
        done = TRUE;
        break;
      }
      // advance to the next primary-different line
      while(cur < size && toSort[cur]->strength != UCOL_PRIMARY) {
        cur++;
      }
      // nothing found
      if(cur == size) {
        break;
      }
      // we need to skip over reordering things. If they were worthy, they would
      // have been detected in the previous iteration.
      //if(expansionLine.lastCC && toSort[k]->firstCC && expansionLine.lastCC > toSort[k]->firstCC) {
      //k++;
      //continue;
      //}
      candidate = previous;
      candidate.append(expansionLine);
      candidate.append(*toSort[cur]);
      const int32_t order = probe.compare(candidate, expansion);
      if(order == 0) {
        expansionLine = *toSort[cur];
        return TRUE;
      }
      if(order > 0) {
        if(!lastPrimary) {
          done = TRUE;
          break;
        }
        if(exp == *toSort[lastPrimary]) {
          expansionLine = exp;
          return TRUE;
        }
        int32_t scan = lastPrimary;
        while(scan < cur-1) {
          scan++;
          if(toSort[scan]->strength > exp.strength) {
            continue;
          }
          candidate = previous;
          candidate.append(expansionLine);
          candidate.append(*toSort[scan]);
          if(probe.compare(candidate, expansion) > 0) {
            break;
          }
        }
        // we got into situation where we have ch > ch+dot-below
        // however, ch is a contraction and therefore we cannot use
        // it properly. If we have hit on a contraction, we'll just try
        // to continue. Probably need more logic here.
        if(contractionsTable->get(UnicodeString(candidate.name, candidate.len)) == NULL) {
          expansionLine.append(*toSort[scan-1]);
          piecesAdded++;
          break;
        }
      }
      lastPrimary = cur;
      cur++;
    }
    if(!lastPrimary || cur == size) {
      break;
    }
  }
  return piecesAdded > 0;
}
|
|
|
|
/**
 * Tightens an expansion description by moving the reset point and the
 * expansion pieces as far up the sorted order as the comparison with
 * toSort[expansionIndex] allows, and by trying to insert ignorables.
 *
 * @param resetIndex      index of the line the expansion is reset to
 * @param expansionIndex  index of the expanding line in toSort
 * @param expLine         expansion sequence; rebuilt from expIndexes when
 *                        anything changed
 * @param expIndexes      toSort indexes of the pieces of expLine; may gain
 *                        entries, so the caller must provide spare room
 * @param expIndexSize    number of valid entries in expIndexes
 * @param strength        strength level being processed
 * @return the (possibly advanced) reset index
 *
 * All scans over toSort are bounded by `size`.
 */
int32_t
SortedLines::gooseUp(int32_t resetIndex, int32_t expansionIndex, Line &expLine, int32_t *expIndexes, int32_t &expIndexSize, UColAttributeValue strength)
{
  int32_t i = expansionIndex, k = resetIndex+1, n = 0, m = 0, start = 0;
  UBool haveChanges = FALSE;
  Line trial, prefix, suffix;
  // we will first try goosing up the reset index
  //while(toSort[k]->strength >= strength)
  for( ; k < size && toSort[k]->strength == strength; k++)
  {
    //if(toSort[k]->strength > strength) {
    //continue;
    //}
    trial.setToConcat(toSort[k], &expLine);
    if(probe.compare(trial, *toSort[i]) > 0) {
      break;
    }
  }
  resetIndex = k-1;

  // goose up individual characters
  prefix = *toSort[resetIndex];
  for(n = 0; n < expIndexSize; n++) {
    suffix.clear();
    for(m = n+1; m < expIndexSize; m++) {
      suffix.append(*toSort[expIndexes[m]]);
    }
    k = expIndexes[n]+1;
    //while(toSort[k]->strength >= strength)
    for( ; k < size && toSort[k]->strength == strength; k++)
    {
      //if(toSort[k]->strength > strength) {
      //continue;
      //}
      trial.setToConcat(&prefix, toSort[k]);
      trial.append(suffix);
      if(probe.compare(trial, *toSort[i]) > 0) {
        break;
      }
    }
    if(k > expIndexes[n]+1) {
      haveChanges = TRUE;
      expIndexes[n] = k-1;
    }
    prefix.append(*toSort[expIndexes[n]]);
  }

  // try inserting ignorables
  UColAttributeValue lastStr = UCOL_OFF;
  k = 0;
  while(k < size && toSort[k]->strengthFromEmpty > strength) {
    k++;
  }
  if(k < size && toSort[k]->strengthFromEmpty == strength) {
    start = k;
    prefix = *toSort[resetIndex];
    n = 0;
    while(n <= expIndexSize) {
      suffix.clear();
      for(m = n; m < expIndexSize; m++) {
        suffix.append(*toSort[expIndexes[m]]);
      }
      k = start;
      while(k < size && toSort[k]->strengthFromEmpty == strength) {
        trial.setToConcat(&prefix, toSort[k]);
        trial.append(suffix);
        lastStr = probe.getStrength(trial, *toSort[i]);
        if(lastStr == UCOL_OFF) { // shot over - we won't find anything here
          break;
        } else if(lastStr > strength) {
          // insert toSort[k] at position n, shifting the tail up by one
          for(m = expIndexSize; m > n; m--) {
            expIndexes[m] = expIndexes[m-1];
          }
          expIndexes[n] = k;
          expIndexSize++;
          haveChanges = TRUE;
          break;
        }
#if 0
        if(probe.compare(trial, *toSort[i]) > 0) {
          // if the first one skips, that means that
          // this position doesn't work
          if(k > start) {
            // insert an ignorable on position n
            for(m = expIndexSize; m > n; m--) {
              expIndexes[m] = expIndexes[m-1];
            }
            expIndexes[n] = k-1;
            expIndexSize++;
            haveChanges = TRUE;
            if(n == expIndexSize-1) { // added to the end of the string
              UColAttributeValue str = probe.getStrength(trial, *toSort[i]);
              int32_t putBreakHere = 0;
            }
          }
          break;
        } else {
          lastStr = probe.getStrength(trial, *toSort[i]);
        }
#endif
        k++;
      }
      // At n == expIndexSize there is no piece left to add to the prefix;
      // expIndexes[n] would be one past the valid entries.
      if(n < expIndexSize) {
        prefix.append(*toSort[expIndexes[n]]);
      }
      n++;
    }
  }

  if(haveChanges) {
    expLine.clear();
    for(m = 0; m < expIndexSize; m++) {
      expLine.append(*toSort[expIndexes[m]]);
    }
  }
  return resetIndex;
}
|
|
|
|
int32_t
|
|
SortedLines::detectExpansions()
|
|
{
|
|
logger->log("\n*** Detecting expansions\n\n");
|
|
int32_t exCount = 0;
|
|
int32_t i = 0, j = 0, k = 0, prevK = 0;
|
|
Line *previous, trial, expansionLine;
|
|
UBool foundExp = FALSE, sequenceCompleted = FALSE;
|
|
UColAttributeValue strength = UCOL_OFF;
|
|
UColAttributeValue maxStrength = UCOL_IDENTICAL;
|
|
UColAttributeValue expStrength = UCOL_OFF;
|
|
int32_t expIndexes[256];
|
|
int32_t expIndexSize = 0;
|
|
memset(expIndexes, 0, sizeof(expIndexes));
|
|
|
|
// for each element, we look back to find whether there is such a q for which
|
|
// q <n x < qUBn. These are possible expansions. When going backwards we skip
|
|
// over already detected expansions.
|
|
i = 0;
|
|
// it turns out that looking at accents as possible expansions is
|
|
// quite a stupid thing to do, especially on non ICU platforms.
|
|
// Previously this line skipped over identicals only, but
|
|
// now we are going to skip all the way to non-ignorables.
|
|
while(toSort[i]->strengthFromEmpty > UCOL_PRIMARY) {
|
|
i++;
|
|
}
|
|
i++;
|
|
for( ; i < size; i++) {
|
|
if(toSort[i]->name[0]==0x0063 && toSort[i]->name[1] == 0x68) // && toSort[i]->name[1] == 0x308)0043 0043 0219
|
|
{
|
|
int32_t putBreakpointhere = 0;
|
|
}
|
|
foundExp = FALSE;
|
|
sequenceCompleted = FALSE;
|
|
strength = toSort[i]->strength;
|
|
if(strength == UCOL_IDENTICAL && toSort[i-1]->isExpansion == TRUE) {
|
|
u_strcpy(toSort[i]->expansionString, toSort[i-1]->expansionString);
|
|
toSort[i]->expLen = toSort[i-1]->expLen;
|
|
toSort[i]->isExpansion = TRUE;
|
|
toSort[i]->expIndex = toSort[i-1]->expIndex;
|
|
toSort[i]->expStrength = UCOL_IDENTICAL;
|
|
//toSort[i]->expStrength = toSort[i-1]->expStrength;
|
|
foundExp = TRUE;
|
|
sequenceCompleted = TRUE;
|
|
}
|
|
//logger->log("%i %i\n", i, j);
|
|
while(!foundExp && strength <= maxStrength) {
|
|
j = i-1;
|
|
while(j && (toSort[j]->isExpansion == TRUE || toSort[j]->isRemoved == TRUE)) {
|
|
//if(toSort[j]->strength < strength) {
|
|
//strength = toSort[j]->strength;
|
|
//}
|
|
j--;
|
|
}
|
|
|
|
//while(j && toSort[j]->strength > strength)
|
|
while(j && toSort[j]->strength > probe.getStrength(*toSort[j], *toSort[i]))
|
|
{
|
|
j--;
|
|
}
|
|
//if(toSort[j]->strength == strength) {
|
|
previous = toSort[j];
|
|
if(previous->strengthFromEmpty >= UCOL_IDENTICAL ||
|
|
(previous->strengthFromEmpty == UCOL_SECONDARY
|
|
&& strength == UCOL_SECONDARY
|
|
&& previous->lastCC > UB[strength]->firstCC)) {
|
|
break;
|
|
//continue;
|
|
}
|
|
//trial.setToConcat(previous, UB[strength]);
|
|
trial.setToConcat(previous, UB[probe.getStrength(*toSort[j], *toSort[i])]);
|
|
if(probe.compare(trial, *toSort[i]) > 0) {
|
|
foundExp = TRUE;
|
|
}
|
|
//}
|
|
if(strength == UCOL_QUATERNARY) {
|
|
strength = UCOL_IDENTICAL;
|
|
} else {
|
|
strength = (UColAttributeValue)(strength + 1);
|
|
}
|
|
}
|
|
// calculate the expanding sequence
|
|
if(foundExp && !sequenceCompleted) {
|
|
expIndexSize = 0;
|
|
expansionLine.clear();
|
|
exCount++;
|
|
// we will start from strength between the expansion
|
|
// and the target (toSort[i] and toSort[j]. First we
|
|
// will add as many primaries as possible. Then we will
|
|
// try to add secondary pieces and then tertiary.
|
|
// found an expansion - what is the expanding sequence?
|
|
|
|
expStrength = UCOL_PRIMARY;
|
|
while(!sequenceCompleted) {
|
|
k = 0;
|
|
prevK = 0;
|
|
while(k < size) {
|
|
if(expansionLine.len > 15) {
|
|
sequenceCompleted = TRUE;
|
|
break;
|
|
}
|
|
while(k < size && toSort[k]->strength != UCOL_PRIMARY) {
|
|
k++;
|
|
}
|
|
// nothing found
|
|
if(k == size) {
|
|
break;
|
|
}
|
|
// we need to skip over reordering things. If they were worthy, they would
|
|
// have been detected in the previous iteration.
|
|
//if(expansionLine.lastCC && toSort[k]->firstCC && expansionLine.lastCC > toSort[k]->firstCC) {
|
|
//k++;
|
|
//continue;
|
|
//}
|
|
trial = *previous;
|
|
trial.append(expansionLine);
|
|
trial.append(*toSort[k]);
|
|
if(toSort[k]->name[0] == 0x0067) {
|
|
int32_t putBreakPointHere = 0;
|
|
}
|
|
if(probe.compare(trial, *toSort[i]) > 0) {
|
|
if(prevK) {
|
|
// we got into situation where we have ch > ch+dot-below
|
|
// however, ch is a contraction and therefore we cannot use
|
|
// it properly. If we have hit on a contraction, we'll just try
|
|
// to continue. Probably need more logic here.
|
|
if(contractionsTable->get(UnicodeString(trial.name, trial.len)) == NULL) {
|
|
expansionLine.append(*toSort[prevK]);
|
|
expIndexes[expIndexSize++] = prevK;
|
|
break;
|
|
} else {
|
|
int32_t putBreakPointHere = 0;
|
|
}
|
|
} else {
|
|
sequenceCompleted = TRUE;
|
|
break;
|
|
}
|
|
//break;
|
|
}
|
|
prevK = k;
|
|
k++;
|
|
}
|
|
if(!prevK || k == size) {
|
|
break;
|
|
}
|
|
}
|
|
// after this we have primaries lined up.
|
|
// we are going to goose up with secondaries and
|
|
// tertiaries
|
|
trial.setToConcat(toSort[j], &expansionLine);
|
|
expStrength = probe.getStrength(trial, *toSort[i]);
|
|
if(expStrength > UCOL_PRIMARY) {
|
|
if(expStrength == UCOL_SECONDARY || expStrength == UCOL_OFF) {
|
|
j = gooseUp(j, i, expansionLine, expIndexes, expIndexSize, UCOL_SECONDARY);
|
|
trial.setToConcat(toSort[j], &expansionLine);
|
|
expStrength = probe.getStrength(trial, *toSort[i]);
|
|
if(expStrength == UCOL_TERTIARY) {
|
|
j = gooseUp(j, i, expansionLine, expIndexes, expIndexSize, UCOL_TERTIARY);
|
|
}
|
|
} else if(expStrength == UCOL_TERTIARY) {
|
|
j = gooseUp(j, i, expansionLine, expIndexes, expIndexSize, UCOL_TERTIARY);
|
|
}
|
|
}
|
|
trial.setToConcat(toSort[j], &expansionLine);
|
|
expStrength = probe.getStrength(trial, *toSort[i]);
|
|
if(expansionLine.len) {
|
|
if(expansionLine.name[0] == 0x73 && expansionLine.name[1] == 0x7a) {
|
|
int32_t putBreakpointhere = 0;
|
|
}
|
|
UBool isExpansionLineAContraction = (contractionsTable->get(UnicodeString(expansionLine.name, expansionLine.len)) != NULL);
|
|
// we have an expansion line and an expansion. There could be some expansions where
|
|
// the difference between expansion line and the end of expansion sequence is less or
|
|
// equal than the expansion strength. These should probably be removed.
|
|
int32_t diffLen = toSort[i]->len - expansionLine.len;
|
|
if(diffLen > 0) {
|
|
trial.setTo(UnicodeString(toSort[i]->name + diffLen, toSort[i]->len - diffLen));
|
|
} else {
|
|
trial = *toSort[i];
|
|
}
|
|
UColAttributeValue s1 = probe.getStrength(trial, expansionLine);
|
|
if(s1 == UCOL_OFF) {
|
|
s1 = probe.getStrength(expansionLine, trial);
|
|
}
|
|
if((!isExpansionLineAContraction && s1 >= expStrength) || (diffLen <= 0 && s1 == UCOL_IDENTICAL)) {
|
|
contractionsTable->remove(UnicodeString(toSort[i]->name, toSort[i]->len));
|
|
toSort[i]->isRemoved = TRUE;
|
|
if(toSort[i]->next && toSort[i]->previous) {
|
|
toSort[i]->previous->next = toSort[i]->next;
|
|
}
|
|
if(toSort[i]->previous && toSort[i]->next) {
|
|
toSort[i]->next->previous = toSort[i]->previous;
|
|
}
|
|
debug->log("Exp -N: ");
|
|
debug->log(toSort[i]->toString(FALSE));
|
|
debug->log(" / ");
|
|
debug->log(expansionLine.toString(FALSE), TRUE);
|
|
}
|
|
else
|
|
{
|
|
u_strncat(toSort[i]->expansionString, expansionLine.name, expansionLine.len);
|
|
toSort[i]->isExpansion = TRUE;
|
|
toSort[i]->expStrength = expStrength;
|
|
toSort[i]->expLen = expansionLine.len;
|
|
toSort[i]->expansionString[toSort[i]->expLen] = 0;
|
|
toSort[i]->expIndex = j;
|
|
}
|
|
}
|
|
}
|
|
if(toSort[i]->isExpansion == TRUE) {
|
|
if(debug->isOn()) {
|
|
debug->log("Exp + : &");
|
|
debug->log(toSort[j]->toString(FALSE));
|
|
debug->log(toSort[i]->strengthToString(toSort[i]->expStrength, TRUE));
|
|
debug->log(toSort[i]->toString(FALSE));
|
|
debug->log(" ");
|
|
if(!toSort[j]->sortKey) {
|
|
calculateSortKey(*toSort[j]);
|
|
}
|
|
debug->log(toSort[j]->dumpSortkey());
|
|
debug->log(" ... ");
|
|
if(!toSort[i]->sortKey) {
|
|
calculateSortKey(*toSort[i]);
|
|
}
|
|
debug->log(toSort[i]->dumpSortkey());
|
|
calculateSortKey(expansionLine);
|
|
debug->log("/");
|
|
debug->log(expansionLine.dumpSortkey(), TRUE);
|
|
}
|
|
|
|
}
|
|
}
|
|
// after detecting expansions, we want to position them.
|
|
// it is better to position expansions after all have been detected,
|
|
// since otherwise we will change the ordering.
|
|
for(i = size-1; i >= 0; i--) {
|
|
if(toSort[i]->isExpansion) {
|
|
if(toSort[i]->name[0] == 0x2A3) {
|
|
int32_t putBreakPointHere = 0;
|
|
}
|
|
if(i) {
|
|
if(toSort[i]->previous) {
|
|
toSort[i]->previous->next = toSort[i]->next;
|
|
}
|
|
}
|
|
if(i < size-1) {
|
|
if(toSort[i]->next) {
|
|
toSort[i]->next->previous = toSort[i]->previous;
|
|
}
|
|
}
|
|
j = toSort[i]->expIndex;
|
|
toSort[i]->next = toSort[j]->next;
|
|
toSort[i]->previous = toSort[j];
|
|
toSort[j]->next = toSort[i];
|
|
if(toSort[i]->next) {
|
|
toSort[i]->next->previous = toSort[i];
|
|
}
|
|
toSort[i]->strength = toSort[i]->expStrength;
|
|
}
|
|
}
|
|
return exCount;
|
|
}
|
|
|
|
|
|
/**
 * Moves the internal cursor to the head of the linked list.
 * @return the first line (the new cursor position)
 */
Line *
SortedLines::getFirst() {
  return (current = first);
}
|
|
|
|
/**
 * Moves the internal cursor to the tail of the linked list.
 * @return the last line (the new cursor position)
 */
Line *
SortedLines::getLast() {
  return (current = last);
}
|
|
|
|
void
|
|
SortedLines::add(Line *line, UBool linkIn) {
|
|
if(size++ == capacity) {
|
|
// grow
|
|
}
|
|
lines[size] = *line;
|
|
Line *toAdd = &lines[size];
|
|
if(linkIn && first) {
|
|
Line *current = first;
|
|
while(current != NULL && probe.comparer(¤t, &toAdd) < 0) {
|
|
current = current->next;
|
|
}
|
|
if(current == NULL) {
|
|
toAdd->previous = last;
|
|
toAdd->next = NULL;
|
|
if(last != NULL) {
|
|
last->next = toAdd;
|
|
}
|
|
last = toAdd;
|
|
if(first == NULL) {
|
|
first = toAdd;
|
|
}
|
|
} else { // current != NULL
|
|
toAdd->next = current;
|
|
toAdd->previous = current->previous;
|
|
if(current->previous) {
|
|
current->previous->next = toAdd;
|
|
} else {
|
|
first = toAdd;
|
|
}
|
|
current->previous = toAdd;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/**
 * Advances the internal cursor along the `next` links.
 * @return the new cursor position, or NULL when the cursor was already
 *         NULL or has run off the end of the list
 */
Line *
SortedLines::getNext()
{
  if(current == NULL) {
    return NULL;
  }
  current = current->next;
  return current;
}
|
|
|
|
/**
 * Moves the internal cursor back along the `previous` links.
 * @return the new cursor position, or NULL when the cursor was already
 *         NULL or has run off the front of the list
 */
Line *
SortedLines::getPrevious()
{
  if(current == NULL) {
    return NULL;
  }
  current = current->previous;
  return current;
}
|
|
|
|
/**
 * Returns the element reached by following `next` links `index` times
 * from the first line.
 * @param index number of links to follow; values <= 0 yield the first line
 * @return the line found, or NULL when the list is shorter than that
 */
Line *
SortedLines::operator[](int32_t index)
{
  Line *cursor = first;
  int32_t stepsLeft = index;
  // once the cursor is NULL further steps cannot change the result
  while(stepsLeft > 0 && cursor != NULL) {
    cursor = cursor->next;
    --stepsLeft;
  }
  return cursor;
}
|
|
|
|
/**
 * Renders a sequence of lines as rule-like text.
 *
 * Reset lines are written as "&" followed by the name, other lines that
 * are not removed are written as their strength marker followed by
 * Line::toString(). The first line never gets a strength marker or a
 * sort key dump.
 *
 * @param sortedLines   array of lines; element 0 is always the start
 * @param linesSize     number of elements in sortedLines
 * @param pretty        add names/newlines for human reading
 * @param useLinks      TRUE: follow `next` links from sortedLines[0];
 *                      FALSE: walk the array in index order
 * @param printSortKeys append sort key dumps (ignored when no sort keys
 *                      have been calculated)
 * @return the rendered text; empty when there is nothing to render
 */
UnicodeString
SortedLines::arrayToString(Line** sortedLines, int32_t linesSize, UBool pretty, UBool useLinks, UBool printSortKeys) {
  UnicodeString result;
  if(sortedLines == NULL || linesSize <= 0) {
    // nothing to print; sortedLines[0] would not be valid
    return result;
  }
  int32_t i = 0;

  Line *line = NULL;
  Line *previous = sortedLines[0];
  if(printSortKeys && !sortkeys) {
    printSortKeys = FALSE;
  }

  // the first element is special: no strength marker in front of it
  if(previous->isReset) {
    result.append(" & ");
    result.append(previous->name, previous->len);
    if(pretty) {
      result.append(" # ");
      result.append(previous->stringToName(previous->name, previous->len));
      result.append("\n");
    }
  } else if(!previous->isRemoved) {
    result.append(previous->toString(pretty));
    if(pretty) {
      result.append("\n");
    }
  }

  i = 1;
  while((i < linesSize && !useLinks) || (previous->next && useLinks)) {
    if(useLinks) {
      line = previous->next;
    } else {
      line = sortedLines[i];
    }
    if(line->isReset) {
      result.append(" &");
      result.append(line->name, line->len);
      if(pretty) {
        result.append(" # ");
        result.append(line->stringToName(line->name, line->len));
        result.append("\n");
      }
    } else if(!line->isRemoved) {
      result.append(line->strengthToString(line->strength, pretty));
      result.append(line->toString(pretty));
      if(printSortKeys) {
        result.append(line->dumpSortkey());
      }
      if(pretty) {
        result.append("\n");
      }
    }
    previous = line;
    i++;
  }
  return result;
}
|
|
|
|
/**
 * Constructs a SortedLines object from a dump previously written by toFile().
 *
 * The first line of the dump is "size,frenchSecondary,upperFirst"; every
 * following line is handed to Line::initFromString(). Lines are linked in
 * file order.
 *
 * @param file   open stream positioned at the start of the dump
 * @param logger progress output
 * @param debug  debug output
 * @param status set to U_INVALID_FORMAT_ERROR when the header cannot be
 *               parsed or announces a non-positive size; in that case the
 *               object is left empty
 */
SortedLines::SortedLines(FILE *file, UPrinter *logger, UPrinter *debug, UErrorCode &status) :
  toSort(NULL),
  toSortCapacity(0),
  lines(NULL),
  size(0),
  capacity(0),
  first(NULL),
  last(NULL),
  logger(logger),
  debug(debug),
  contractionsTable(NULL),
  duplicators(NULL),
  maxExpansionPrefixSize(0),
  wordSort(FALSE),
  frenchSecondary(FALSE),
  upperFirst(FALSE),
  sortkeys(NULL),
  sortkeyOffset(0)
{
  debug->log("*** loading a dump\n");
  memset(UB, 0, sizeof(UB));
  int32_t i = 0;
  for(i = 0; i < UCOL_OFF; i++) {
    UB[i] = &empty;
  }

  int32_t newFrench = 0, newUpperFirst = 0;
  const int fieldsRead = fscanf(file, "%i,%i,%i\n", &size, &newFrench, &newUpperFirst);
  debug->log("Read size %i, frenchSecondary %i and upperFirst %i\n", size, newFrench, newUpperFirst);
  if(fieldsRead != 3 || size <= 0) {
    // unusable header: stay empty rather than allocating from garbage
    size = 0;
    status = U_INVALID_FORMAT_ERROR;
    return;
  }
  frenchSecondary = (UBool)newFrench;
  upperFirst = (UBool)newUpperFirst;
  capacity = size;
  lines = new Line[capacity];
  i = 0;

  char buff[256];

  // never read more lines than were allocated, whatever the file contains
  while(i < capacity && fgets(buff, 256, file)) {
    if(i % 20 == 0) {
      logger->log("\rLine: %04i", i, buff);
    }
    lines[i].initFromString(buff, 256, status);
    if(i) {
      lines[i].previous = &lines[i-1];
      lines[i-1].next = &lines[i];
    }
    i++;
  }
  // the number of lines actually read wins over the announced size
  size = i;
  toSort = new Line*[size];
  setSortingArray(toSort, lines, size);
  if(size > 0) {
    first = &lines[0];
    last = &lines[size-1];
  }
}
|
|
|
|
void
|
|
SortedLines::toFile(FILE *file, UBool useLinks, UErrorCode &status)
|
|
{
|
|
fprintf(file, "%i,%i,%i\n", size, frenchSecondary, upperFirst);
|
|
int32_t i = 1;
|
|
Line *previous = toSort[0];
|
|
Line *line = NULL;
|
|
char buff[256];
|
|
previous->write(buff, 256, status);
|
|
fprintf(file, "%s\n", buff);
|
|
fflush(file);
|
|
while(previous->next) {
|
|
if(useLinks) {
|
|
line = previous->next;
|
|
} else {
|
|
line = toSort[i];
|
|
}
|
|
line->write(buff, 256, status);
|
|
fprintf(file, "%s\n", buff);
|
|
i++;
|
|
previous = line;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/**
 * Renders toSort[0..size-1] in array order, using each line's
 * strengthFromEmpty as its strength marker (non-pretty form).
 *
 * Reset lines are written as "&" plus the name; other lines that are not
 * removed are written as marker plus Line::toString(FALSE). The first line
 * never gets a strength marker.
 *
 * @return the rendered text; empty when there are no lines
 */
UnicodeString
SortedLines::toStringFromEmpty() {
  UnicodeString result;
  if(size <= 0 || toSort == NULL) {
    return result;
  }
  // Array order only: the loop is bounded by size so toSort is never
  // indexed past its end, whatever the next links say.
  for(int32_t i = 0; i < size; i++) {
    Line *line = toSort[i];
    if(line->isReset) {
      // the leading reset carries an extra space after the ampersand
      if(i == 0) {
        result.append(" & ");
      } else {
        result.append(" &");
      }
      result.append(line->name, line->len);
    } else if(!line->isRemoved) {
      if(i > 0) {
        result.append(line->strengthToString(line->strengthFromEmpty, FALSE));
      }
      result.append(line->toString(FALSE));
    }
  }
  return result;
}
|
|
|
|
/**
 * Compact (non-pretty) rendering of all lines, without sort keys.
 * @param useLinks follow next links instead of array order
 */
UnicodeString
SortedLines::toString(UBool useLinks)
{
  const UBool pretty = FALSE;
  const UBool printSortKeys = FALSE;
  return arrayToString(toSort, size, pretty, useLinks, printSortKeys);
}
|
|
|
|
|
|
/**
 * Human-readable rendering of all lines.
 * @param useLinks      follow next links instead of array order
 * @param printSortKeys append sort key dumps to each line
 */
UnicodeString
SortedLines::toPrettyString(UBool useLinks, UBool printSortKeys)
{
  const UBool pretty = TRUE;
  return arrayToString(toSort, size, pretty, useLinks, printSortKeys);
}
|
|
|
|
/**
 * Dispatches to the renderer selected by `format`: "HTML" -> toHTML(),
 * "XML" -> toXML(), anything else -> toBundle(). The comparison is
 * case-sensitive. All other arguments are forwarded unchanged.
 */
UnicodeString
SortedLines::toOutput(const char *format,
                      const char *locale, const char *platform, const char *reference,
                      UBool useLinks, UBool initialize, UBool moreToCome) {
  const UBool wantsHTML = (UBool)(strcmp(format, "HTML") == 0);
  if(wantsHTML) {
    return toHTML(locale, platform, reference, useLinks, initialize, moreToCome);
  }
  const UBool wantsXML = (UBool)(strcmp(format, "XML") == 0);
  if(wantsXML) {
    return toXML(locale, platform, reference, useLinks, initialize, moreToCome);
  }
  // default output format
  return toBundle(locale, platform, reference, useLinks, initialize, moreToCome);
}
|
|
|
|
|
|
/**
 * Renders the lines as an HTML page that mimics a collation resource bundle.
 *
 * @param locale     locale name written into the header
 * @param platform   platform name written into the header
 * @param reference  reference platform name written into the header
 * @param useLinks   TRUE: follow `next` links from toSort[0];
 *                   FALSE: walk toSort in array order
 * @param initialize emit the page/bundle preamble
 * @param moreToCome when FALSE, emit the closing braces and </html>
 * @return the rendered text
 */
UnicodeString
SortedLines::toHTML(const char *locale,
                    const char *platform, const char *reference,
                    UBool useLinks, UBool initialize, UBool moreToCome)
{
  UnicodeString result;
  int32_t i = 0;
  if(initialize) {
    result.append("<html>\n<head>\n<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">\n</head>\n");
    result.append("# Collation data resource bundle generated for locale: ");
    result.append(locale);
    result.append("<br>\n# For platform ");
    result.append(platform);
    result.append(" reference platform ");
    result.append(reference);
    result.append("<br><br>\n\n\n");

    result.append(locale);
    if(platform) {
      result.append("_");
      result.append(platform);
    }
    if(reference) {
      result.append("_vs_");
      result.append(reference);
    }
    result.append(" {<br>\n");

    result.append(" collations {<br>\n standard {<br>\n Sequence {<br>\n");
  }

  if(frenchSecondary) {
    result.append("[backwards 2]<br>\n");
  }
  if(upperFirst) {
    result.append("[casefirst upper]<br>\n");
  }

  // no lines at all: there is no toSort[0] to start from
  Line *line = (size > 0 && toSort != NULL) ? toSort[0] : NULL;

  i = 0;
  // NOTE(review): in link mode the loop ends when line->next is NULL, so
  // the last linked line is not emitted - confirm this is intended.
  while(line != NULL && ((i < size && !useLinks) || (line->next && useLinks))) {
    if(line->isReset || !line->isRemoved) {
      result.append(line->toHTMLString());
    }
    i++;
    if(useLinks) {
      line = line->next;
    } else if(i < size) {
      // only fetch the next element while it exists (avoids reading toSort[size])
      line = toSort[i];
    }
  }
  if(!moreToCome) {
    result.append(" }<br>\n }<br>\n }<br>\n}<br>\n");

    result.append("</html>\n");
  }

  return result;
}
|
|
|
|
/**
 * "XML" output. The implementation has always produced exactly the same
 * text as toHTML() (the body was a verbatim copy), so it now delegates
 * instead of duplicating the code.
 *
 * NOTE(review): no actual XML is generated - presumably a placeholder.
 *
 * Parameters are those of toHTML() and are forwarded unchanged.
 */
UnicodeString
SortedLines::toXML(const char *locale,
                   const char *platform, const char *reference,
                   UBool useLinks, UBool initialize, UBool moreToCome)
{
  return toHTML(locale, platform, reference, useLinks, initialize, moreToCome);
}
|
|
|
|
/**
 * Renders the lines as the text of a collation resource bundle.
 *
 * @param locale     locale name; used in the header comment and as the
 *                   bundle name
 * @param platform   platform name written into the header comment
 * @param reference  reference platform name written into the header comment
 * @param useLinks   TRUE: follow `next` links from toSort[0];
 *                   FALSE: walk toSort in array order
 * @param initialize emit the header comment and opening braces
 * @param moreToCome when FALSE, emit the closing braces
 * @return the rendered text
 */
UnicodeString
SortedLines::toBundle(const char *locale,
                      const char *platform, const char *reference,
                      UBool useLinks, UBool initialize, UBool moreToCome)
{
  UnicodeString result;
  int32_t i = 0;

  if(initialize) {
    result.append("// Collation data resource bundle generated for locale: ");
    result.append(locale);
    result.append("\n// For platform ");
    result.append(platform);
    result.append(" reference platform ");
    result.append(reference);
    result.append("\n\n\n");

    // The bundle name is the bare locale; appending "_<platform>" and
    // "_vs_<reference>" (as the HTML output does) is disabled here.
    result.append(locale);
    result.append(" {\n");

    result.append(" collations {\n standard {\n Sequence {\n");
  }

  if(frenchSecondary) {
    result.append("[backwards 2]\n");
  }
  if(upperFirst) {
    result.append("[casefirst upper]\n");
  }

  // no lines at all: there is no toSort[0] to start from
  Line *line = (size > 0 && toSort != NULL) ? toSort[0] : NULL;

  i = 0;
  // NOTE(review): in link mode the loop ends when line->next is NULL, so
  // the last linked line is not emitted - confirm this is intended.
  while(line != NULL && ((i < size && !useLinks) || (line->next && useLinks))) {
    if(line->isReset || !line->isRemoved) {
      result.append(line->toBundleString());
    }
    i++;
    if(useLinks) {
      line = line->next;
    } else if(i < size) {
      // only fetch the next element while it exists (avoids reading toSort[size])
      line = toSort[i];
    }
  }

  if(!moreToCome) {
    result.append(" }\n }\n }\n}\n");
  }

  return result;
}
|
|
|
|
|
|
int32_t
|
|
SortedLines::getSize() const {
|
|
return repertoire.size();
|
|
}
|
|
|
|
void
|
|
SortedLines::reduceDifference(SortedLines& reference) {
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
if(upperFirst) {
|
|
swapCase();
|
|
}
|
|
// both sorted lines structures need to have established links and strengths
|
|
// We walk down both structures and note differences. These
|
|
// differences will modify this by removng elements, setting resets
|
|
// etc...
|
|
// we will prefer insertions from tailoring to reference, then deletions
|
|
// there are two tables that keep seen elements.
|
|
Hashtable *seenThis = new Hashtable();
|
|
Hashtable *seenReference = new Hashtable();
|
|
|
|
|
|
UBool found = FALSE;
|
|
UBool finished = FALSE;
|
|
const int32_t lookForward = 20;
|
|
int32_t tailoringMove = 0;
|
|
//int32_t referenceSize = reference.getSize();
|
|
Line *refLine = reference.getFirst();
|
|
Line *refLatestEqual = refLine;
|
|
refLine = refLine->next;
|
|
Line *myLine = getFirst();
|
|
Line *myLatestEqual = myLine;
|
|
myLatestEqual->isRemoved = TRUE;
|
|
myLine = myLine->next;
|
|
while(myLine && refLine) {
|
|
found = FALSE;
|
|
while(myLine && refLine && myLine->equals(*refLine)) {
|
|
myLatestEqual = myLine;
|
|
myLatestEqual->isRemoved = TRUE;
|
|
myLine = myLine->next;
|
|
refLatestEqual = refLine;
|
|
refLine = refLine->next;
|
|
if(refLine == NULL && myLine == NULL) {
|
|
finished = TRUE;
|
|
}
|
|
}
|
|
if(myLine) {
|
|
myLine->cumulativeStrength = myLine->strength;
|
|
}
|
|
if(refLine) {
|
|
refLine->cumulativeStrength = refLine->strength;
|
|
}
|
|
|
|
// here is the difference
|
|
while(!found && !finished) {
|
|
tailoringMove = 0;
|
|
if(myLine && refLine) {
|
|
if(myLine->cumulativeStrength > refLine->cumulativeStrength) {
|
|
// tailoring z <<< x, UCA z < y
|
|
while(myLine->cumulativeStrength > refLine->cumulativeStrength) {
|
|
myLine = myLine->next;
|
|
if(myLine) {
|
|
transferCumulativeStrength(myLine->previous, myLine);
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
} else if(myLine->cumulativeStrength < refLine->cumulativeStrength) {
|
|
// tailoring z < x, UCA z <<< y
|
|
while(myLine->cumulativeStrength < refLine->cumulativeStrength) {
|
|
seenReference->put(UnicodeString(refLine->name, refLine->len), refLine, status);
|
|
refLine = refLine->next;
|
|
if(refLine) {
|
|
transferCumulativeStrength(refLine->previous, refLine);
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
// this is the interesting point. Now we search for character match
|
|
while(myLine && refLine && (!myLine->equals(*refLine) || myLine->strength == UCOL_IDENTICAL)
|
|
&& tailoringMove < lookForward) {
|
|
if(seenThis->get(UnicodeString(refLine->name, refLine->len))) {
|
|
// we are not interested in stuff from the reference that is already accounted
|
|
// for in the tailoring.
|
|
refLine = refLine->next;
|
|
if(refLine) {
|
|
transferCumulativeStrength(refLine->previous, refLine);
|
|
}
|
|
} else {
|
|
myLine = myLine->next;
|
|
if(myLine) {
|
|
transferCumulativeStrength(myLine->previous, myLine);
|
|
if(!seenReference->get(UnicodeString(myLine->name, myLine->len))) {
|
|
tailoringMove++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if(refLine == NULL) { // ran out of reference
|
|
// this is the tail of tailoring - the last insertion
|
|
myLine = NULL;
|
|
found = TRUE;
|
|
} else if(tailoringMove == lookForward || myLine == NULL) { // run over treshold or out of tailoring
|
|
tailoringMove = 0;
|
|
// we didn't find insertion after all
|
|
// we will try substitution next
|
|
// reset the tailoring pointer
|
|
myLine = myLatestEqual->next;
|
|
// move the reference
|
|
refLine = refLine->next;
|
|
if(refLine) {
|
|
transferCumulativeStrength(refLine->previous, refLine);
|
|
}
|
|
} else { // we found an insertion
|
|
tailoringMove = 0;
|
|
if(myLine->strength != refLine->strength) {
|
|
while(myLine && refLine && *myLine == *refLine
|
|
&& (myLine->strength != refLine->strength
|
|
|| myLine->strength == UCOL_IDENTICAL)) {
|
|
myLine = myLine->next;
|
|
refLine = refLine->next;
|
|
}
|
|
if(*myLine != *refLine) {
|
|
continue;
|
|
}
|
|
}
|
|
if(myLine && refLine && myLine->previous->strength < myLine->strength) {
|
|
myLine = myLine->next;
|
|
refLine = refLine->next;
|
|
if(*myLine != *refLine) {
|
|
continue;
|
|
}
|
|
}
|
|
found = TRUE;
|
|
}
|
|
if(found) {
|
|
if(myLatestEqual->next != myLine || refLine == NULL) {
|
|
Line *myStart = NULL;
|
|
// this is a reset and a sequence
|
|
// myLatestEqual points at the last point that was the same
|
|
// This point will be a reset
|
|
if(myLine && refLine) { // if there is anything more to do - it might be worth saving it
|
|
myStart = myLatestEqual;
|
|
while(myStart != myLine) {
|
|
seenThis->put(UnicodeString(myStart->name, myStart->len), myStart, status);
|
|
myStart = myStart->next;
|
|
}
|
|
}
|
|
// Try to weed out stuff that is not affected, like:
|
|
// Tailoring:
|
|
// <<<S<<\u017F<\u0161<<<\u0160<t
|
|
// UCA:
|
|
// <<<S<<\u0161<<<\u0160<<\u017F<t
|
|
// Result:
|
|
// &S<<\u017F<\u0161<<<\u0160
|
|
// we have a sequence that spans from myLatestEqual to myLine (that one could be NULL,
|
|
// so we have to go down from myLatestEqual.
|
|
// Basically, for every element, we want to see the strongest cumulative difference
|
|
// from the reset point. If the cumulative difference is the same in both the reference and
|
|
// tailoring, that element could be removed.
|
|
calculateCumulativeStrengths(myLatestEqual, myLine);
|
|
calculateCumulativeStrengths(refLatestEqual, refLine);
|
|
myStart = myLatestEqual;
|
|
int32_t removed = 0;
|
|
int32_t traversed = 0;
|
|
while(myStart && myStart != myLine) {
|
|
Line *refStart = refLatestEqual;
|
|
while(refStart && refStart != refLine) {
|
|
if(*myStart == *refStart) {
|
|
if(myStart->cumulativeStrength == refStart->cumulativeStrength) {
|
|
myStart->isRemoved = TRUE;
|
|
removed++;
|
|
}
|
|
}
|
|
refStart = refStart->next;
|
|
}
|
|
myStart = myStart->next;
|
|
traversed++;
|
|
}
|
|
if(removed < traversed) {
|
|
myLatestEqual->isReset = TRUE;
|
|
myLatestEqual->isRemoved = FALSE;
|
|
}
|
|
|
|
myLatestEqual = myLine;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if(upperFirst) {
|
|
//swapCase();
|
|
}
|
|
|
|
delete seenThis;
|
|
delete seenReference;
|
|
|
|
}
|
|
|
|
void
|
|
SortedLines::transferCumulativeStrength(Line *previous, Line *that) {
|
|
if(that->strength > previous->cumulativeStrength) {
|
|
that->cumulativeStrength = previous->cumulativeStrength;
|
|
} else {
|
|
that->cumulativeStrength = that->strength;
|
|
}
|
|
}
|
|
|
|
void
|
|
SortedLines::calculateCumulativeStrengths(Line *start, Line *end) {
|
|
// start is a reset - end may be NULL
|
|
start = start->next;
|
|
UColAttributeValue cumulativeStrength = UCOL_OFF;
|
|
while(start && start != end) {
|
|
if(start->strength < cumulativeStrength) {
|
|
cumulativeStrength = start->strength;
|
|
}
|
|
start->cumulativeStrength = cumulativeStrength;
|
|
start = start->next;
|
|
}
|
|
}
|
|
|
|
|
|
void
|
|
SortedLines::getRepertoire(UnicodeSet &fillIn) {
|
|
fillIn.clear();
|
|
fillIn.addAll(repertoire);
|
|
}
|
|
|
|
|
|
void
|
|
SortedLines::removeDecompositionsFromRepertoire() {
|
|
UnicodeSetIterator repertoireIter(repertoire);
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
UChar string[256];
|
|
UChar composed[256];
|
|
int32_t len = 0, compLen = 0;
|
|
UnicodeString compString;
|
|
UnicodeSet toRemove;
|
|
|
|
while(repertoireIter.next()) {
|
|
len = 0;
|
|
if(repertoireIter.isString()) { // process a string
|
|
len = repertoireIter.getString().length();
|
|
u_memcpy(string, repertoireIter.getString().getBuffer(), len);
|
|
} else { // process code point
|
|
UBool isError = FALSE;
|
|
U16_APPEND(string, len, 25, repertoireIter.getCodepoint(), isError);
|
|
}
|
|
string[len] = 0; // zero terminate, for our evil ways
|
|
compLen = unorm_normalize(string, len, UNORM_NFC, 0, composed, 256, &status);
|
|
if(compLen != len || u_strcmp(string, composed) != 0) {
|
|
compString.setTo(composed, compLen);
|
|
if(repertoire.contains(compString)) {
|
|
toRemove.add(UnicodeString(string, len));
|
|
}
|
|
}
|
|
}
|
|
debug->log("\nRemoving\n");
|
|
debug->log(toRemove.toPattern(compString, TRUE), TRUE);
|
|
repertoire.removeAll(toRemove);
|
|
}
|
|
|
|
|
|
void
|
|
SortedLines::swapCase()
|
|
{
|
|
int32_t i = 0;
|
|
for(i = 0; i < size; i++) {
|
|
toSort[i]->swapCase();
|
|
}
|
|
}
|
|
|
|
void
|
|
SortedLines::calculateSortKey(Line &line)
|
|
{
|
|
if(!sortkeys) {
|
|
sortkeys = new uint8_t[size*1024];
|
|
memset(sortkeys, 0, size*1024);
|
|
}
|
|
line.sortKey = sortkeys+sortkeyOffset;
|
|
sortkeyOffset += probe.getSortKey(line, sortkeys+sortkeyOffset, size*256-sortkeyOffset);
|
|
}
|
|
|
|
|
|
void
|
|
SortedLines::calculateSortKeys()
|
|
{
|
|
if(sortkeys) {
|
|
delete[] sortkeys;
|
|
}
|
|
sortkeyOffset = 0;
|
|
sortkeys = new uint8_t[size*256];
|
|
memset(sortkeys, 0, size*256);
|
|
int32_t i = 0;
|
|
for(i = 0; i < size; i++) {
|
|
calculateSortKey(*toSort[i]);
|
|
}
|
|
}
|