2017-10-03 02:32:50 +00:00
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
2006-10-27 00:03:21 +00:00
# include "sortedlines.h"
static int codePointCmp ( const void * a , const void * b ) {
return u_strcmp ( ( * ( Line * * ) a ) - > name , ( * ( Line * * ) b ) - > name ) ;
}
/**
 * Creates a SortedLines object for the given repertoire.
 *
 * All buffers start out empty/NULL; the bounds table UB is cleared and every
 * entry below UCOL_OFF is pointed at `empty`. Finally init() fills `lines`
 * from the repertoire.
 *
 * @param set            repertoire copied into this object
 * @param excludeBounds  set consulted by getBounds() when picking bounds
 * @param probe          strength probe copied into this object
 * @param logger         printer used for progress/log output
 * @param debug          printer used for debug output
 */
SortedLines::SortedLines(const UnicodeSet &set, const UnicodeSet &excludeBounds, const StrengthProbe &probe,
                         UPrinter *logger, UPrinter *debug)
    : toSort(NULL),
      toSortCapacity(0),
      lines(NULL),
      size(0),
      capacity(0),
      repertoire(set),
      excludeBounds(excludeBounds),
      probe(probe),
      first(NULL),
      last(NULL),
      logger(logger),
      debug(debug),
      contractionsTable(NULL),
      duplicators(NULL),
      maxExpansionPrefixSize(0),
      wordSort(FALSE),
      frenchSecondary(FALSE),
      upperFirst(FALSE),
      sortkeys(NULL),
      sortkeyOffset(0)
{
    // Zero the whole table first, then give every strength below UCOL_OFF
    // a valid (empty) bound so that UB[strength] can always be dereferenced.
    memset(UB, 0, sizeof(UB));
    for (int32_t strength = 0; strength < UCOL_OFF; ++strength) {
        UB[strength] = &empty;
    }
    init();
}
/**
 * Releases the line storage, the sort key buffer, the sorting array and the
 * two hash tables owned by this object.
 */
SortedLines::~SortedLines()
{
    // delete / delete[] on a null pointer is a no-op, so no guards are needed.
    delete[] lines;
    delete[] sortkeys;
    delete[] toSort;
    delete contractionsTable;
    delete duplicators;
}
void
SortedLines : : getBounds ( UErrorCode & status ) {
// first sort through the set
debug - > log ( toString ( ) , TRUE ) ;
int32_t i = 0 , j = 0 ;
UColAttributeValue strength = UCOL_OFF ;
for ( i = 0 ; i < size ; i + + ) {
if ( toSort [ i ] - > strengthFromEmpty < strength ) {
if ( i & & strength < UCOL_OFF ) {
//u_strcpy(UB[strength], toSort[i-1]->name);
j = 1 ;
while ( excludeBounds . contains ( UnicodeString ( toSort [ i - j ] - > name , toSort [ i - j ] - > len ) ) ) {
j + + ;
}
UB [ strength ] = toSort [ i - j ] ;
}
strength = toSort [ i ] - > strengthFromEmpty ;
if ( strength = = UCOL_PRIMARY ) {
probe . SE = toSort [ i ] - > name [ 0 ] ;
}
}
}
//u_strcpy(UB[strength], toSort[size-1]->name);
// a different solution for bounds: go from end and see if the guys on the top
// cause duplication for things
UChar dupch [ ] = { 0x0020 , 0x0030 , 0x0042 , 0x0051 , 0x0062 , 0x0071 , 0x0391 , 0x0396 , 0x03b1 , 0x03b6 } ;
j = 1 ;
Line dup ;
Line bound ;
int32_t dups = 0 ;
while ( j < size ) {
dups = 0 ;
for ( i = 0 ; i < sizeof ( dupch ) / sizeof ( dupch [ 0 ] ) ; i + + ) {
dup . setTo ( dupch [ i ] ) ;
dup . append ( dupch [ i ] ) ;
bound . setTo ( dupch [ i ] ) ;
bound . append ( toSort [ size - j ] - > name , toSort [ size - j ] - > len ) ;
if ( probe . getStrength ( dup , bound ) > = UCOL_IDENTICAL ) {
dups + + ;
}
}
if ( dups = = 0 ) {
break ;
} else {
if ( ! duplicators ) {
duplicators = new Hashtable ( ) ;
}
duplicators - > put ( UnicodeString ( toSort [ size - j ] - > name , toSort [ size - j ] - > len ) , & toSort [ size - j ] , status ) ;
debug - > log ( toSort [ size - j ] - > toString ( ) ) ;
debug - > log ( " is not good enough to be an upper bound \n " ) ;
j + + ;
}
}
if ( j = = size ) {
debug - > log ( " Oi! I'm hallucinating. Will use the first upper bound " ) ;
delete duplicators ;
duplicators = NULL ;
j = 1 ;
}
/*
j = 1 ;
while ( excludeBounds . contains ( UnicodeString ( toSort [ size - j ] - > name , toSort [ size - j ] - > len ) ) ) {
j + + ;
}
*/
UB [ strength ] = toSort [ size - j ] ;
for ( i = 0 ; i < UCOL_OFF ; i + + ) {
if ( UB [ i ] ) {
//debug->log(UB[i], TRUE);
debug - > log ( UB [ i ] - > toString ( TRUE ) , TRUE ) ;
}
}
}
// Classifies the repertoire according to the strength of each line's
// difference from the empty string.
//
// For every entry of toSort the probe's distanceFromEmptyString() result is
// stored in strengthFromEmpty. The loop tracks the smallest value seen so far
// (strongestStrengthFromEmpty). When an entry reports a value larger than that
// minimum, its strength is re-derived ("interpolated") from its neighbours in
// toSort and the wordSort flag is set.
// Results are written to the debug printer at the end.
void
SortedLines : : classifyRepertoire ( ) {
UColAttributeValue strongestStrengthFromEmpty = UCOL_OFF ;
// NOTE(review): lastChange is assigned but never read, and j is unused, in the live code below.
int32_t lastChange = 0 ;
int32_t i = 0 , j = 0 ;
while ( i < size ) // && probe.distanceFromEmptyString(*toSort[i]) > UCOL_PRIMARY)
{
toSort [ i ] - > strengthFromEmpty = probe . distanceFromEmptyString ( * toSort [ i ] ) ;
if ( toSort [ i ] - > strengthFromEmpty < strongestStrengthFromEmpty ) {
strongestStrengthFromEmpty = toSort [ i ] - > strengthFromEmpty ;
lastChange = i ;
} else if ( toSort [ i ] - > strengthFromEmpty > strongestStrengthFromEmpty ) {
// there is a problem in detection. Most probably a quaternary.
// why don't we try to interpolate
UColAttributeValue nextStrength = UCOL_OFF ;
UColAttributeValue prevStrength = UCOL_OFF ;
UColAttributeValue st = UCOL_OFF ;
logger - > log ( " Interpolating to get the distance from empty for Line " ) ;
logger - > log ( toSort [ i ] - > toString ( TRUE ) , TRUE ) ;
// Estimate from the previous element: keep the previous element's distance
// unless the two differ at a smaller strength value, in which case use that.
if ( i ) {
st = probe . getStrength ( * toSort [ i - 1 ] , * toSort [ i ] ) ;
if ( st = = UCOL_OFF ) {
logger - > log ( " Cannot deduce distance from empty using previous element. Something is very wrong! Line: " ) ;
logger - > log ( toSort [ i ] - > toString ( TRUE ) , TRUE ) ;
} else if ( st = = UCOL_IDENTICAL | | st > = toSort [ i - 1 ] - > strengthFromEmpty ) {
prevStrength = toSort [ i - 1 ] - > strengthFromEmpty ;
} else if ( st < toSort [ i - 1 ] - > strengthFromEmpty ) {
prevStrength = st ;
}
toSort [ i ] - > strengthFromEmpty = prevStrength ;
}
// Estimate from the next element; when this branch runs its result overwrites
// the estimate taken from the previous element.
// NOTE(review): the bound is size - 2, so the next-element estimate is skipped
// for the last two entries even though toSort[i + 1] exists for i == size - 2;
// confirm whether size - 1 was intended.
if ( i < size - 2 ) {
toSort [ i + 1 ] - > strengthFromEmpty = probe . distanceFromEmptyString ( * toSort [ i + 1 ] ) ;
st = probe . getStrength ( * toSort [ i + 1 ] , * toSort [ i ] ) ;
if ( st = = UCOL_OFF ) {
logger - > log ( " Cannot deduce distance from empty using next element. Something is very wrong! Line: " ) ;
logger - > log ( toSort [ i ] - > toString ( TRUE ) , TRUE ) ;
} else if ( st = = UCOL_IDENTICAL | | st < toSort [ i + 1 ] - > strengthFromEmpty ) {
nextStrength = toSort [ i + 1 ] - > strengthFromEmpty ;
} else if ( st > = toSort [ i + 1 ] - > strengthFromEmpty ) {
nextStrength = st ;
}
if ( i ) {
if ( prevStrength ! = nextStrength ) {
logger - > log ( " Inconsistent results from interpolation! Results will most likely be wrong \n " ) ;
}
}
toSort [ i ] - > strengthFromEmpty = nextStrength ;
}
/*
UColAttributeValue problemStrength = UCOL_PRIMARY ;
for ( j = lastChange ; j < i ; j + + ) {
if ( toSort [ j ] - > strength > problemStrength ) {
problemStrength = toSort [ j ] - > strength ;
}
}
for ( j = lastChange ; j < i ; j + + ) {
toSort [ j ] - > strengthFromEmpty = problemStrength ;
}
strongestStrengthFromEmpty = toSort [ i ] - > strengthFromEmpty ;
lastChange = i ;
debug - > log ( " Problem detected in distances from empty. Most probably word sort is on \n " ) ;
*/
// Any out-of-order distance is taken as a sign that word sort is in effect.
wordSort = TRUE ;
}
i + + ;
}
debug - > log ( " Distances from empty string \n " ) ;
debug - > log ( toStringFromEmpty ( ) , TRUE ) ;
}
void
SortedLines : : analyse ( UErrorCode & status ) {
frenchSecondary = probe . isFrenchSecondary ( status ) ;
if ( U_FAILURE ( status ) ) {
logger - > log ( " Test for French secondary failed. Bailing out! \n " ) ;
return ;
}
logger - > log ( " French secondary value is %i \n " , frenchSecondary , frenchSecondary ) ;
upperFirst = probe . isUpperFirst ( status ) ;
if ( U_FAILURE ( status ) ) {
logger - > log ( " Test for upper first failed. Bailing out! \n " ) ;
return ;
}
logger - > log ( " upper first value is %i \n " , upperFirst , upperFirst ) ;
sort ( TRUE , TRUE ) ;
classifyRepertoire ( ) ;
getBounds ( status ) ;
//sort(TRUE, TRUE);
addContractionsToRepertoire ( status ) ;
//sort(TRUE, TRUE);
debug - > log ( " \n *** Order after detecting contractions \n \n " ) ;
calculateSortKeys ( ) ;
debug - > log ( toPrettyString ( FALSE , TRUE ) , TRUE ) ;
detectExpansions ( ) ;
}
void SortedLines : : init ( )
{
size = repertoire . size ( ) ;
capacity = 5 * size ;
lines = new Line [ capacity ] ;
init ( repertoire , lines ) ;
}
void SortedLines : : init ( UnicodeSet & rep , Line * lin )
{
UnicodeSetIterator exemplarUSetIter ( rep ) ;
int32_t size = 0 ;
while ( exemplarUSetIter . next ( ) ) {
Line * currLine = lin + size ;
if ( exemplarUSetIter . isString ( ) ) { // process a string
currLine - > setTo ( exemplarUSetIter . getString ( ) ) ;
} else { // process code point
currLine - > setTo ( exemplarUSetIter . getCodepoint ( ) ) ;
}
currLine - > name [ currLine - > len ] = 0 ; // zero terminate, for our evil ways
//currLine->index = size;
size + + ;
}
}
void
SortedLines : : setSortingArray ( Line * * sortingArray , Line * elements , int32_t sizeToSort ) {
int32_t i = 0 ;
for ( i = 0 ; i < sizeToSort ; i + + ) {
sortingArray [ i ] = & elements [ i ] ;
}
}
/**
 * Copies every value pointer stored in `table` into `sortingArray`, in the
 * table's iteration order.
 *
 * @param sortingArray destination array; must have room for table->count() entries
 * @param table        hash table whose values are treated as Line pointers
 * @return table->count(), sampled before the iteration
 */
int32_t
SortedLines::setSortingArray(Line **sortingArray, Hashtable *table) {
    const int32_t tableCount = table->count();
    int32_t position = -1;  // iteration starts from -1
    int32_t filled = 0;
    for (const UHashElement *element = table->nextElement(position);
         element != NULL;
         element = table->nextElement(position)) {
        sortingArray[filled++] = (Line *)element->value.pointer;
    }
    return tableCount;
}
void
SortedLines : : sort ( Line * * sortingArray , int32_t sizeToSort , UBool setStrengths , UBool link ) {
int32_t i = 0 ;
int32_t equalStart = 0 ;
UColAttributeValue equalStrength = UCOL_OFF ;
qsort ( sortingArray , sizeToSort , sizeof ( Line * ) , probe . comparer ) ;
if ( setStrengths ) { // analyze strengths
for ( i = 1 ; i < sizeToSort ; i + + ) {
sortingArray [ i ] - > strength = probe . getStrength ( * sortingArray [ i - 1 ] , * sortingArray [ i ] ) ;
}
// for equal guys, do the code point ordering
i = 1 ;
while ( i < sizeToSort )
{
if ( sortingArray [ i ] - > strength = = UCOL_IDENTICAL ) {
equalStart = i - 1 ;
equalStrength = sortingArray [ equalStart ] - > strength ;
sortingArray [ equalStart ] - > strength = UCOL_IDENTICAL ;
while ( i < sizeToSort & & sortingArray [ i ] - > strength = = UCOL_IDENTICAL ) {
i + + ;
}
qsort ( sortingArray + equalStart , i - equalStart , sizeof ( Line * ) , codePointCmp ) ;
sortingArray [ equalStart ] - > strength = equalStrength ;
} else {
i + + ;
}
}
}
if ( link ) { // do the linking
for ( i = 0 ; i < sizeToSort - 1 ; i + + ) {
Line * curr = * ( sortingArray + i ) ;
curr - > next = * ( sortingArray + i + 1 ) ;
( * ( sortingArray + i + 1 ) ) - > previous = curr ;
}
}
}
void
SortedLines : : sort ( UBool setStrengths , UBool link ) {
if ( toSortCapacity < size | | ! toSort ) {
if ( toSort ) {
delete [ ] toSort ;
}
toSort = new Line * [ size * 2 ] ;
toSortCapacity = size * 2 ;
}
setSortingArray ( toSort , lines , size ) ;
sort ( toSort , size , setStrengths , link ) ;
first = last = NULL ;
if ( link ) { // do the linking
first = * toSort ;
last = * ( toSort + size - 1 ) ;
}
}
void
SortedLines : : updateBounds ( UnicodeSet & set ) {
Line line ;
UnicodeString s1 ;
UnicodeSetIterator it1 ( set ) ;
while ( it1 . next ( ) ) {
if ( ! debug - > isOn ( ) ) {
logger - > log ( " . " ) ;
}
if ( it1 . isString ( ) ) { // process a string
s1 . setTo ( it1 . getString ( ) ) ;
} else { // process code point
s1 . setTo ( it1 . getCodepoint ( ) ) ;
}
//line.setTo(s1);
UColAttributeValue strength = probe . distanceFromEmptyString ( s1 ) ;
if ( probe . compare ( UnicodeString ( UB [ strength ] - > name ) , s1 ) < 0 ) {
// TODO: leak here - fixit!
UB [ strength ] = new Line ( s1 ) ;
//u_strcpy(UB[strength], s1.getTerminatedBuffer());
}
}
}
void SortedLines : : addAll ( Line * toAdd , int32_t toAddSize )
{
if ( size + toAddSize > capacity ) {
int32_t doGrowingBreakpoint = 0 ;
// we need to do growing here
}
int32_t i = 0 ;
for ( i = 0 ; i < toAddSize ; i + + ) {
lines [ size + i ] = toAdd [ i ] ;
}
size + = toAddSize ;
}
void SortedLines : : setDistancesFromEmpty ( Line * array , int32_t arraySize )
{
int32_t i = 0 ;
for ( i = 0 ; i < arraySize ; i + + ) {
array [ i ] . strengthFromEmpty = probe . distanceFromEmptyString ( array [ i ] ) ;
}
}
// Adds contractions in to the repertoire.
//
// Creates contractionsTable, runs detectContractions() over toSort x toSort
// ("generation 0") and, if the number of contractions found is below half of
// the repertoire size, runs one further generation combining the repertoire
// with the contractions just found. Everything found is appended to `lines`
// via addAll() and the repertoire is re-sorted.
//
// status is reset to U_ZERO_ERROR after detection, so failures set by the
// detection calls above that point are not reported to the caller.
//
// Returns the total number of contractions reported by detectContractions().
int32_t SortedLines : : addContractionsToRepertoire ( UErrorCode & status )
{
logger - > log ( " \n *** Detecting contractions \n \n " ) ;
// NOTE(review): a table left over from an earlier call would be overwritten here without being deleted.
contractionsTable = new Hashtable ( ) ;
int32_t noConts = 0 ;
int32_t allocateSize = 50 * size ;
// first check for simple contractions
// Two result buffers (delta / newDelta) plus pointer arrays used for sorting
// them; lesserToAddTo collects the contractions reported with the "L" tag.
Line * delta = new Line [ allocateSize ] ;
Line * * deltaSorted = new Line * [ allocateSize ] ;
Line * lesserToAddTo = new Line [ allocateSize ] ;
Line * newDelta = new Line [ allocateSize ] ;
Line * * newDeltaSorted = new Line * [ allocateSize ] ;
// deltaP / deltaPP: results of the previous generation (and their sorted view);
// newDeltaP: buffer receiving the results of the current generation.
Line * deltaP = delta ;
Line * * deltaPP = deltaSorted ;
Line * newDeltaP = newDelta ;
int32_t deltaSize = 0 , lesserToAddToSize = 0 , newDeltaSize = 0 ;
logger - > log ( " ++ Contraction detection generation 0 \n " ) ;
noConts = detectContractions ( toSort , size , toSort , size ,
delta , deltaSize , lesserToAddTo , lesserToAddToSize , 3 * size , status ) ;
setSortingArray ( deltaSorted , delta , deltaSize ) ;
sort ( deltaSorted , deltaSize , TRUE ) ;
setDistancesFromEmpty ( delta , deltaSize ) ;
int32_t deltaPSize = deltaSize ;
//updateBounds(delta);
int32_t generation = 0 ;
// if we found any, we have to try multiple contractions
// However, we want to prevent the contractions explosion
// if the number of simple contractions is greater than the
// starting size, chances are that we either have an algorithmic
// contraction (like iteration marks on w2k) or something
// is seriously wrong.
if ( deltaPSize < size / 2 ) {
// The generation < 1 condition limits this loop to a single extra generation.
while ( deltaPSize & & generation < 1 ) {
generation + + ;
logger - > log ( " \n ++ Contraction detection generation %i \n " , generation , generation ) ;
// find more, but avoid testing the combinations we already have
noConts + = detectContractions ( toSort , size , deltaPP , deltaPSize ,
newDeltaP , newDeltaSize , lesserToAddTo , lesserToAddToSize , 3 * size , status ) ;
noConts + = detectContractions ( deltaPP , deltaPSize , toSort , size ,
newDeltaP , newDeltaSize , lesserToAddTo , lesserToAddToSize , 3 * size , status ) ;
calculateSortKeys ( ) ;
// Fold the previous generation into the repertoire and re-sort it.
addAll ( deltaP , deltaPSize ) ;
setSortingArray ( toSort , lines , size ) ;
sort ( TRUE , TRUE ) ;
setSortingArray ( newDeltaSorted , newDeltaP , newDeltaSize ) ;
sort ( newDeltaSorted , newDeltaSize , TRUE ) ;
// if no new ones, bail
//if (newDeltaSize == 0) break;
deltaPSize = newDeltaSize ;
newDeltaSize = 0 ;
// Swap buffer roles: what was just produced becomes the "previous generation".
if ( deltaP = = delta ) {
deltaP = newDelta ;
deltaPP = newDeltaSorted ;
newDeltaP = delta ;
} else {
deltaP = delta ;
deltaPP = deltaSorted ;
newDeltaP = newDelta ;
}
setDistancesFromEmpty ( deltaP , deltaPSize ) ;
}
}
status = U_ZERO_ERROR ;
// add stuff from the last batch
addAll ( deltaP , deltaPSize ) ;
// warning: we don't add the lesser ones in recursively, since they will
// infinitely loop
setDistancesFromEmpty ( lesserToAddTo , lesserToAddToSize ) ;
addAll ( lesserToAddTo , lesserToAddToSize ) ;
setSortingArray ( toSort , lines , size ) ;
sort ( TRUE , TRUE ) ;
// The lines were copied into `lines` by addAll(), so the scratch buffers can go.
delete [ ] deltaSorted ;
delete [ ] delta ;
delete [ ] lesserToAddTo ;
delete [ ] newDeltaSorted ;
delete [ ] newDelta ;
return noConts ;
}
/**
 * Tests every combination firstRep[i] + secondRep[j] for contraction-like
 * behaviour and records the hits through noteContraction().
 *
 * A combination is reported when the concatenation sorts at or below the
 * first element alone ("L", stored in lesserToAddTo), above the first
 * element followed by the primary upper bound ("U"), differs from the first
 * element at a stronger level than the second element differs from the empty
 * string ("D1"), or when, for two primary elements, its position relative to
 * the neighbouring combination does not match the position of the second
 * element relative to its own neighbour ("!1" .. "!5").
 *
 * Elements starting with a Han character are skipped, as are first elements
 * whose distance from the empty string is weaker than primary, and second
 * elements registered in `duplicators`.
 *
 * @param firstRep          sorted array of left-hand candidates
 * @param firstSize         number of entries in firstRep
 * @param secondRep         sorted array of right-hand candidates
 * @param secondSize        number of entries in secondRep
 * @param toAddTo           output array for detected contractions
 * @param toAddToSize       in/out count of entries in toAddTo
 * @param lesserToAddTo     output array for "L" contractions
 * @param lesserToAddToSize in/out count of entries in lesserToAddTo
 * @param capacity          detection stops once this many contractions were found by this call
 * @param status            ICU error code; the outer loop stops when it reports failure
 * @return number of contractions noted by this call
 */
int32_t SortedLines::detectContractions(Line **firstRep, int32_t firstSize,
                                        Line **secondRep, int32_t secondSize,
                                        Line *toAddTo, int32_t &toAddToSize,
                                        Line *lesserToAddTo, int32_t &lesserToAddToSize,
                                        int32_t capacity, UErrorCode &status)
{
    int32_t noConts = 0;
    int i = 0, j = 0, k = 0;
    Line lower, upper, trial, toAdd, helper;
    UChar32 firstStart, firstEnd, secondStart;
    UChar NFCTrial[256];
    int32_t NFCTrialLen = 0;
    UBool thai = FALSE;

    i = -1;
    while (i < firstSize - 1 && U_SUCCESS(status)) {
        i++;
        if (!debug->isOn()) {
            logger->log(" \r Testing %05i/%05i. Found %05i conts. ", i, firstSize, noConts);
        }
        U16_GET(firstRep[i]->name, 0, 0, firstRep[i]->len, firstStart);
        if (uscript_getScript(firstStart, &status) == USCRIPT_HAN || firstRep[i]->strengthFromEmpty > UCOL_PRIMARY) //UCOL_TERTIARY)
        {
            continue;
        }
        lower = *firstRep[i];
        for (j = 0; j < secondSize; j++) {
            if (noConts == capacity) {
                return noConts;
            }
            U16_GET(secondRep[j]->name, 0, 0, secondRep[j]->len, secondStart);
            if (uscript_getScript(secondStart, &status) == USCRIPT_HAN) // || secondRep[j]->strengthFromEmpty > UCOL_TERTIARY)
            {
                continue;
            }
            if (duplicators && duplicators->get(UnicodeString(secondRep[j]->name, secondRep[j]->len)) != NULL) {
                debug->log(" Skipping duplicator ");
                debug->log(secondRep[j]->toString(), TRUE);
                continue;
            }

            upper.setToConcat(firstRep[i], UB[UCOL_PRIMARY]);
            //upper.setToConcat(firstRep[i], UB[secondRep[j]->strengthFromEmpty]);
            toAdd.setToConcat(firstRep[i], secondRep[j]);
            U16_GET(firstRep[i]->name, 0, firstRep[i]->len - 1, firstRep[i]->len, firstEnd);
            if ((thai = u_hasBinaryProperty(firstEnd, UCHAR_LOGICAL_ORDER_EXCEPTION))) {
                // this means that the lower is single reordering character
                // if we do the lower test without taking this into account,
                // we'll comparing the secondRep directly to Thai. We add UB[UCOL_PRIMARY] to
                // end of lower and in the middle of trial, so we will have
                // lower = Thai + UB, trial Thai + UB + x, resolving to
                // UB + Thai vs UB + Thai + x.
                // for upper bound, we do the similar, so we have
                // upper = Thai + UB + UB, trial = Thai + UB + x,
                // resolving to UB + Thai + UB vs UB + Thai + x
                if (secondRep[j]->firstCC) {
                    UChar32 UBChar;
                    U16_GET(UB[UCOL_SECONDARY]->name, 0, 0, UB[UCOL_SECONDARY]->len, UBChar);
                    if (secondRep[j]->firstCC > u_getCombiningClass(UBChar)) {
                        continue;
                    }
                }
                upper = *firstRep[i];
                upper.append(*UB[UCOL_PRIMARY]);
                //upper.append(*UB[secondRep[j]->strengthFromEmpty]);
                upper.append(*UB[UCOL_PRIMARY]);
                // Rebuild lower from scratch before appending the bound. Several
                // `continue` statements below skip the "restore lower" step at the
                // end of the loop body; without this reset the bound would be
                // appended again on every following iteration.
                lower = *firstRep[i];
                lower.append(*UB[UCOL_PRIMARY]);
                trial = *firstRep[i];
                trial.append(*UB[UCOL_PRIMARY]);
                trial.append(*secondRep[j]);
            } else if ((firstRep[i]->lastCC > secondRep[j]->firstCC && secondRep[j]->firstCC && !frenchSecondary)
                       || (firstRep[i]->firstCC < secondRep[j]->lastCC && firstRep[i]->firstCC && frenchSecondary)) {
                // Skip because normalization will reorder
                // there will be a chance to check this again, since if we
                // try a+b, we will also try b+a
                continue;
            } else if (frenchSecondary && (firstRep[i]->strengthFromEmpty > UCOL_PRIMARY && secondRep[j]->strengthFromEmpty > UCOL_PRIMARY)) {
                continue;
            } else if (firstRep[i]->lastCC && secondRep[j]->firstCC && frenchSecondary) {
                trial.setToConcat(secondRep[j], firstRep[i]);
            } else {
                trial.setToConcat(firstRep[i], secondRep[j]);
            }

            // Now let's check the trial. The problem is that when you combine characters,
            // you can end up with concatenation that is unknown for the examined API.
            NFCTrialLen = unorm_normalize(trial.name, trial.len, UNORM_NFC, 0, NFCTrial, 256, &status);
            if ((u_strcmp(trial.name, NFCTrial) == 0) || u_strFindLast(NFCTrial, NFCTrialLen, secondRep[j]->name, secondRep[j]->len)) {
                if (secondRep[j]->strengthFromEmpty > UCOL_TERTIARY) {
                    continue;
                }
            }
            // Skip trials whose normalized form starts or ends with a character
            // that is weaker than tertiary relative to the empty string.
            UChar32 c;
            U16_GET(NFCTrial, 0, 0, NFCTrialLen, c);
            helper.setTo(c);
            if (probe.distanceFromEmptyString(helper) > UCOL_TERTIARY) {
                continue;
            }
            if (NFCTrialLen > 1) {
                U16_GET(NFCTrial, 0, NFCTrialLen - 1, NFCTrialLen, c);
                helper.setTo(c);
                if (probe.distanceFromEmptyString(helper) > UCOL_TERTIARY) {
                    continue;
                }
            }

            if (probe.compare(lower, trial) >= 0) { // if lower is bigger than trial
                // this might be ok, but I'm having doubts. Here is an additional check:
                if (firstRep[i]->len == 1 || secondRep[j]->strengthFromEmpty == UCOL_PRIMARY) {
                    // I'm basically saying that I'll add this kind of contraction for cases where I combine
                    // one letter with an accent OR when I'm combining more than one symbol with a letter.
                    noteContraction(" L ", lesserToAddTo, lesserToAddToSize, firstRep[i], secondRep[j], noConts, status);
                }
            }
            else if (probe.compare(trial, upper) > 0) { // trial is bigger than upper??
                noteContraction(" U ", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
            }
#if 0
            else if (firstRep[i]->strengthFromEmpty == UCOL_PRIMARY)
            {
                Line expansionLine;
                if (getExpansionLine(trial, *firstRep[i], *secondRep[j], expansionLine) &&
                    expansionLine.len && !(expansionLine == *secondRep[j])) {
                    noteContraction(" D ", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
                }
            }
#endif
            else if (firstRep[i]->strengthFromEmpty == UCOL_PRIMARY && probe.getStrength(lower, trial) < secondRep[j]->strengthFromEmpty) {
                noteContraction(" D1 ", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
            }
            else if (firstRep[i]->strengthFromEmpty == UCOL_PRIMARY && secondRep[j]->strengthFromEmpty == UCOL_PRIMARY)
            {
                // I have added an additional check. The checks versus upper and lower bound should be sufficient
                // when the right side is a combining mark. There might be a reordering of combining marks, but
                // that should be already visible in their order.
                // compare the sequence
                // Y- <? Y <? Y+
                // and
                // XY- <? XY <? XY+
                Line xym, xyp;
                UBool xymIsContraction = FALSE, toAddIsContraction = FALSE;
                if (j) {
                    if (((!secondRep[j - 1]->firstCC || firstRep[i]->lastCC < secondRep[j - 1]->firstCC) && !frenchSecondary)
                        || ((!firstRep[i]->firstCC || firstRep[i]->firstCC > secondRep[j - 1]->lastCC) && frenchSecondary)) {
                        xym.setToConcat(firstRep[i], secondRep[j - 1]);
                        toAdd.strength = probe.getStrength(xym, toAdd);
                        if (secondRep[j]->strength != toAdd.strength) {
                            // there is possibility that either xym or xy are contractions
                            // There are two situations:
                            // xym > xy or xym <n xy and ym <k y but n != k
                            // if they are reordered, we are going to see if each of them
                            // is further reordered
                            if (toAdd.strength == UCOL_OFF) {
                                // check whether toAdd shifted more down
                                k = j - 2;
                                while (k >= 0 && secondRep[k]->strength > secondRep[j]->strength) {
                                    k--;
                                }
                                while (!toAddIsContraction && k >= 0) {
                                    xyp.setToConcat(firstRep[i], secondRep[k]);
                                    if (contractionsTable->get(UnicodeString(xyp.name, xyp.len)) != NULL) {
                                        k--;
                                        continue;
                                    }
                                    if (probe.compare(xyp, xym) >= 0) {
                                        // xyp looks like a contraction
                                        noteContraction(" !1 ", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
                                        toAddIsContraction = TRUE;
                                    } else {
                                        break;
                                    }
                                }
                                // first let's see if xym has moved beyond
                                if (contractionsTable->get(UnicodeString(xym.name, xym.len)) == NULL) {
                                    k = j + 1;
                                    // ignore weaker strengths
                                    while (k < secondSize && secondRep[k]->strength > secondRep[j]->strength) {
                                        k++;
                                    }
                                    // check if we skipped the following guy
                                    if (k < secondSize) {
                                        xyp.setToConcat(firstRep[i], secondRep[k]);
                                        if (probe.compare(xyp, xym) <= 0) {
                                            // xyp looks like a contraction
                                            noteContraction(" !2 ", toAddTo, toAddToSize, firstRep[i], secondRep[j - 1], noConts, status);
                                            xymIsContraction = TRUE;
                                        }
                                    }
                                } else {
                                    xymIsContraction = TRUE;
                                }
                                // if they have reordered, but none has moved, then we add them both
                                // and hope for the best
                                if (!xymIsContraction && !toAddIsContraction) {
                                    // it is possible that there is an NFC version of one of the
                                    // strings. If we have XY > XZ, but NFC(XZ) = W and X < W, we might
                                    // have a false contraction.
                                    trial.len = unorm_normalize(toAdd.name, toAdd.len, UNORM_NFC, 0, trial.name, 25, &status);
                                    //UColAttributeValue strength = probe.getStrength(*firstRep[i], trial);
                                    if (trial == toAdd) {
                                        noteContraction(" !3 ", toAddTo, toAddToSize, firstRep[i], secondRep[j - 1], noConts, status);
                                        noteContraction(" !3 ", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
                                    } else {
                                        noteContraction(" !4 ", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
                                    }
                                }
                            } else { // only the strength has changed
                                // check whether the previous is contraction and if not, add the current
                                if (contractionsTable->get(UnicodeString(xym.name, xym.len)) == NULL) {
                                    noteContraction(" !5 ", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
                                }
                            }
                        }
                    }
                }
            }
            if (thai) { // restore lower
                lower = *firstRep[i];
            }
        }
    }
    return noConts;
}
void
SortedLines : : noteContraction ( const char * msg , Line * toAddTo , int32_t & toAddToSize , Line * left , Line * right , int32_t & noConts , UErrorCode & status )
{
Line toAdd ;
toAdd . setToConcat ( left , right ) ;
toAdd . left = left ;
toAdd . right = right ;
// if we're adding an accent to an existing contraction, we want to check
#if 0
Line test , trial1 , trial2 ;
if ( right - > strengthFromEmpty > UCOL_PRIMARY ) {
if ( left - > right & & left - > right - > previous & & left - > right - > next ) {
test . setToConcat ( left - > left , left - > right - > previous ) ;
trial1 . setToConcat ( & test , right ) ;
test . setToConcat ( left - > left , left - > right - > next ) ;
trial2 . setToConcat ( & test , right ) ;
if ( probe . compare ( trial1 , toAdd ) < 0 & & probe . compare ( toAdd , trial2 ) < 0 ) {
// this means that the contraction has been broken by the newly added accent
// so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
debug - > log ( " Con - " ) ;
debug - > log ( msg ) ;
debug - > log ( toAdd . toString ( FALSE ) , TRUE ) ;
return ;
}
} else {
if ( right - > previous & & right - > next ) {
trial1 . setToConcat ( left , right - > previous ) ;
trial2 . setToConcat ( left , right - > next ) ;
if ( probe . compare ( trial1 , toAdd ) < 0 & & probe . compare ( toAdd , trial2 ) < 0 ) {
// this means that the contraction has been broken by the newly added accent
// so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
debug - > log ( " Con - " ) ;
debug - > log ( msg ) ;
debug - > log ( toAdd . toString ( FALSE ) , TRUE ) ;
return ;
}
}
if ( left - > previous & & left - > next ) {
trial1 . setToConcat ( left - > previous , right ) ;
trial2 . setToConcat ( left - > next , right ) ;
if ( probe . compare ( trial1 , toAdd ) < 0 & & probe . compare ( toAdd , trial2 ) < 0 ) {
// this means that the contraction has been broken by the newly added accent
// so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
debug - > log ( " Con - " ) ;
debug - > log ( msg ) ;
debug - > log ( toAdd . toString ( FALSE ) , TRUE ) ;
return ;
}
}
}
}
if ( right - > right & & right - > right - > strengthFromEmpty > UCOL_PRIMARY & & right - > left - > previous & & right - > left - > next ) { // maybe we already had a contraction with an accent
test . setToConcat ( right - > left - > previous , right - > right ) ;
trial1 . setToConcat ( left , & test ) ;
test . setToConcat ( right - > left - > next , right - > right ) ;
trial2 . setToConcat ( left , & test ) ;
if ( probe . compare ( trial1 , toAdd ) < 0 & & probe . compare ( toAdd , trial2 ) < 0 ) {
// this means that the contraction has been broken by the newly added accent
// so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
debug - > log ( " Con - " ) ;
debug - > log ( msg ) ;
debug - > log ( toAdd . toString ( FALSE ) , TRUE ) ;
return ;
}
}
# endif
if ( contractionsTable - > get ( UnicodeString ( toAdd . name , toAdd . len ) ) = = NULL ) {
if ( probe . distanceFromEmptyString ( toAdd ) < = UCOL_TERTIARY ) {
toAddTo [ toAddToSize + + ] = toAdd ;
contractionsTable - > put ( UnicodeString ( toAdd . name , toAdd . len ) , & toAdd , status ) ;
noConts + + ;
debug - > log ( msg ) ;
debug - > log ( " Con + " ) ;
debug - > log ( toAdd . toString ( FALSE ) , TRUE ) ;
if ( ! left - > sortKey ) {
calculateSortKey ( * left ) ;
}
debug - > log ( left - > dumpSortkey ( ) ) ;
debug - > log ( " + " ) ;
if ( ! right - > sortKey ) {
calculateSortKey ( * right ) ;
}
debug - > log ( right - > dumpSortkey ( ) ) ;
debug - > log ( " = " ) ;
calculateSortKey ( toAdd ) ;
debug - > log ( toAdd . dumpSortkey ( ) , TRUE ) ;
if ( noConts > size / 2 ) {
status = U_BUFFER_OVERFLOW_ERROR ;
}
}
}
}
// Tries to find the sequence of lines which, appended to `previous`, compares
// equal to `expansion`; the sequence is built up in expansionLine.
//
// Parameters:
//   expansion     - the line being explained
//   previous      - the prefix that candidate lines are appended to
//   exp           - line compared against toSort[prevK] (and whose strength
//                   limits the refinement scan) when a candidate overshoots
//   expansionLine - output; cleared on entry
//
// Returns TRUE as soon as a single candidate makes the trial compare equal
// to `expansion` (expansionLine is then set to that candidate), or when `exp`
// equals the last primary candidate before an overshoot. Otherwise returns
// whether any index was recorded in expIndexes.
UBool
SortedLines : : getExpansionLine ( const Line & expansion , const Line & previous , const Line & exp , Line & expansionLine )
{
int expIndexSize = 0 ;
// NOTE(review): expStrength is assigned but never read; expIndexesSize is never used.
UColAttributeValue expStrength = UCOL_OFF ;
int32_t comparisonResult = 0 ;
int32_t i = 0 , k = 0 , prevK = 0 ;
Line trial ;
UBool sequenceCompleted = FALSE ;
int32_t expIndexes [ 256 ] ;
int32_t expIndexesSize = 0 ;
if ( ! sequenceCompleted ) {
expIndexSize = 0 ;
expansionLine . clear ( ) ;
// we will start from strength between the expansion
// and the target (toSort[i] and toSort[j]. First we
// will add as many primaries as possible. Then we will
// try to add secondary pieces and then tertiary.
// found an expansion - what is the expanding sequence?
expStrength = UCOL_PRIMARY ;
// Each pass of this loop scans toSort from the start and tries to extend
// expansionLine by one more piece.
while ( ! sequenceCompleted ) {
k = 0 ;
prevK = 0 ;
while ( k < size ) {
// Give up once the collected sequence gets longer than 15 code units.
if ( expansionLine . len > 15 ) {
sequenceCompleted = TRUE ;
break ;
}
// Advance k to the next line whose strength is UCOL_PRIMARY.
while ( k < size & & toSort [ k ] - > strength ! = UCOL_PRIMARY )
{
k + + ;
}
// nothing found
if ( k = = size ) {
break ;
}
// we need to skip over reordering things. If they were worthy, they would
// have been detected in the previous iteration.
//if(expansionLine.lastCC && toSort[k]->firstCC && expansionLine.lastCC > toSort[k]->firstCC) {
//k++;
//continue;
//}
// trial = previous + what was collected so far + candidate
trial = previous ;
trial . append ( expansionLine ) ;
trial . append ( * toSort [ k ] ) ;
if ( toSort [ k ] - > name [ 0 ] = = 0x0067 ) {
int32_t putBreakPointHere = 0 ;
}
comparisonResult = probe . compare ( trial , expansion ) ;
if ( comparisonResult = = 0 ) {
expansionLine = * toSort [ k ] ;
return TRUE ;
} else if ( comparisonResult > 0 ) {
// The candidate overshoots: the piece we want lies between the previous
// primary candidate (prevK) and this one.
// NOTE(review): prevK == 0 is treated as "no previous candidate", so a
// previous candidate at index 0 is indistinguishable from none.
if ( prevK ) {
if ( exp = = * toSort [ prevK ] ) {
expansionLine = exp ;
return TRUE ;
}
// Walk the lines between prevK and k (skipping those whose strength value
// is greater than exp's) until a trial sorts above the expansion.
i = prevK ;
while ( i < k - 1 ) {
i + + ;
if ( toSort [ i ] - > strength > exp . strength ) {
continue ;
}
trial = previous ;
trial . append ( expansionLine ) ;
trial . append ( * toSort [ i ] ) ;
if ( probe . compare ( trial , expansion ) > 0 ) {
break ;
}
}
// we got into situation where we have ch > ch+dot-below
// however, ch is a contraction and therefore we cannot use
// it properly. If we have hit on a contraction, we'll just try
// to continue. Probably need more logic here.
if ( contractionsTable - > get ( UnicodeString ( trial . name , trial . len ) ) = = NULL ) {
// Keep the line just before the one that overshot and start a new pass.
expansionLine . append ( * toSort [ i - 1 ] ) ;
expIndexes [ expIndexSize + + ] = i - 1 ;
break ;
} else {
int32_t putBreakPointHere = 0 ;
}
} else {
sequenceCompleted = TRUE ;
break ;
}
//break;
}
prevK = k ;
k + + ;
}
// Stop when the scan never got past its first candidate or ran off the end.
if ( ! prevK | | k = = size ) {
break ;
}
}
}
return expIndexSize > 0 ;
}
// Refines an expansion whose expanding sequence has already been assembled by
// the caller, using entries of the requested strength. Three passes are made:
//   1. move the reset point forward past following entries of `strength`,
//   2. replace individual pieces of the expanding sequence by later entries,
//   3. try inserting entries whose strengthFromEmpty equals `strength`.
//
// resetIndex     index in toSort of the current reset point
// expansionIndex index in toSort of the expansion being explained
// expLine        expanding sequence; rebuilt from expIndexes if anything changed
// expIndexes     toSort indexes of the pieces making up expLine (updated in place)
// expIndexSize   number of valid entries in expIndexes (may grow in pass 3)
// strength       strength level being refined
// Returns the (possibly advanced) reset index.
//
// NOTE(review): none of the scans over toSort[k] below check k against size;
// they appear to rely on an entry of a different strength terminating the run.
// NOTE(review): pass 3 can grow expIndexes by one per insertion position and
// nothing here checks the capacity of the caller's array - confirm it is large enough.
int32_t
SortedLines : : gooseUp ( int32_t resetIndex , int32_t expansionIndex , Line & expLine , int32_t * expIndexes , int32_t & expIndexSize , UColAttributeValue strength )
{
int32_t i = expansionIndex , k = resetIndex + 1 , n = 0 , m = 0 , start = 0 ;
UBool haveChanges = FALSE ;
Line trial , prefix , suffix ;
// we will first try goosing up the reset index
//while(toSort[k]->strength >= strength)
for ( ; toSort [ k ] - > strength = = strength ; k + + )
{
//if(toSort[k]->strength > strength) {
//continue;
//}
// candidate reset point followed by the current expanding sequence
trial . setToConcat ( toSort [ k ] , & expLine ) ;
// stop at the first candidate that compares greater than the expansion
if ( probe . compare ( trial , * toSort [ i ] ) > 0 ) {
break ;
}
}
// the last candidate that did not overshoot becomes the reset point
resetIndex = k - 1 ;
// goose up individual characters
prefix = * toSort [ resetIndex ] ;
for ( n = 0 ; n < expIndexSize ; n + + ) {
// suffix = all pieces of the expanding sequence after position n
suffix . clear ( ) ;
for ( m = n + 1 ; m < expIndexSize ; m + + ) {
suffix . append ( * toSort [ expIndexes [ m ] ] ) ;
}
// try the entries following piece n as replacements for it
k = expIndexes [ n ] + 1 ;
//while(toSort[k]->strength >= strength)
for ( ; toSort [ k ] - > strength = = strength ; k + + )
{
//if(toSort[k]->strength > strength) {
//continue;
//}
trial . setToConcat ( & prefix , toSort [ k ] ) ;
trial . append ( suffix ) ;
if ( probe . compare ( trial , * toSort [ i ] ) > 0 ) {
break ;
}
}
// k moved past the first candidate: keep the last one that did not overshoot
if ( k > expIndexes [ n ] + 1 ) {
haveChanges = TRUE ;
expIndexes [ n ] = k - 1 ;
}
// extend the prefix with the (possibly replaced) piece n
prefix . append ( * toSort [ expIndexes [ n ] ] ) ;
}
// try inserting ignorables
UColAttributeValue lastStr = UCOL_OFF ;
k = 0 ;
// find the first entry whose strengthFromEmpty is not greater than `strength`
while ( toSort [ k ] - > strengthFromEmpty > strength ) {
k + + ;
}
if ( toSort [ k ] - > strengthFromEmpty = = strength ) {
start = k ;
prefix = * toSort [ resetIndex ] ;
n = 0 ;
// n runs over every insertion position, including the one after the last piece
while ( n < = expIndexSize ) {
// suffix = pieces from position n onwards (the insertion goes in front of them)
suffix . clear ( ) ;
for ( m = n ; m < expIndexSize ; m + + ) {
suffix . append ( * toSort [ expIndexes [ m ] ] ) ;
}
k = start ;
while ( toSort [ k ] - > strengthFromEmpty = = strength ) {
trial . setToConcat ( & prefix , toSort [ k ] ) ;
trial . append ( suffix ) ;
lastStr = probe . getStrength ( trial , * toSort [ i ] ) ;
if ( lastStr = = UCOL_OFF ) { // shot over - we won't find anything here
break ;
} else if ( lastStr > strength ) {
// shift pieces n.. up by one slot and insert candidate k at position n
for ( m = expIndexSize ; m > n ; m - - ) {
expIndexes [ m ] = expIndexes [ m - 1 ] ;
}
expIndexes [ n ] = k ;
expIndexSize + + ;
haveChanges = TRUE ;
break ;
}
#if 0
if ( probe . compare ( trial , * toSort [ i ] ) > 0 ) {
// if the first one skips, that means that
// this position doesn't work
if ( k > start ) {
// insert an ignorable on position n
for ( m = expIndexSize ; m > n ; m - - ) {
expIndexes [ m ] = expIndexes [ m - 1 ] ;
}
expIndexes [ n ] = k - 1 ;
expIndexSize + + ;
haveChanges = TRUE ;
if ( n = = expIndexSize - 1 ) { // added to the end of the string
UColAttributeValue str = probe . getStrength ( trial , * toSort [ i ] ) ;
int32_t putBreakHere = 0 ;
}
}
break ;
} else {
lastStr = probe . getStrength ( trial , * toSort [ i ] ) ;
}
# endif
k + + ;
}
// NOTE(review): when n == expIndexSize and nothing was inserted, this reads
// expIndexes[expIndexSize], one past the valid entries; the resulting prefix
// is not used afterwards, but the read itself should be confirmed as harmless.
prefix . append ( * toSort [ expIndexes [ n ] ] ) ;
n + + ;
}
}
// rebuild the expanding sequence from the updated list of pieces
if ( haveChanges ) {
expLine . clear ( ) ;
for ( m = 0 ; m < expIndexSize ; m + + ) {
expLine . append ( * toSort [ expIndexes [ m ] ] ) ;
}
}
return resetIndex ;
}
// Walks the sorted array and marks entries that behave like expansions.
// For every candidate toSort[i] an earlier entry toSort[j] is looked up such
// that toSort[j] followed by the upper bound UB[...] compares greater than
// toSort[i]. If one is found, an expanding sequence is assembled from primary
// entries and refined through gooseUp(); the entry is then either flagged as
// removed or flagged as an expansion. In a final pass all expansions are
// re-linked directly after their reset entry (toSort[expIndex]).
// Returns the number of entries for which an expanding sequence was computed.
//
// NOTE(review): the blocks declaring putBreakpointhere / putBreakPointHere have
// no effect; they look like debugger breakpoint anchors.
int32_t
SortedLines : : detectExpansions ( )
{
logger - > log ( " \n *** Detecting expansions \n \n " ) ;
int32_t exCount = 0 ;
int32_t i = 0 , j = 0 , k = 0 , prevK = 0 ;
Line * previous , trial , expansionLine ;
UBool foundExp = FALSE , sequenceCompleted = FALSE ;
UColAttributeValue strength = UCOL_OFF ;
UColAttributeValue maxStrength = UCOL_IDENTICAL ;
UColAttributeValue expStrength = UCOL_OFF ;
// toSort indexes of the pieces that make up the expanding sequence
int32_t expIndexes [ 256 ] ;
int32_t expIndexSize = 0 ;
memset ( expIndexes , 0 , sizeof ( expIndexes ) ) ;
// for each element, we look back to find whether there is such a q for which
// q <n x < qUBn. These are possible expansions. When going backwards we skip
// over already detected expansions.
i = 0 ;
// it turns out that looking at accents as possible expansions is
// not useful, especially on non ICU platforms.
// Previously this line skipped over identicals only, but
// now we are going to skip all the way to non-ignorables.
// NOTE(review): this scan does not check i against size.
while ( toSort [ i ] - > strengthFromEmpty > UCOL_PRIMARY ) {
i + + ;
}
// start with the entry after the first non-ignorable so that i-1 is valid
i + + ;
for ( ; i < size ; i + + ) {
if ( toSort [ i ] - > name [ 0 ] = = 0x0063 & & toSort [ i ] - > name [ 1 ] = = 0x68 ) // && toSort[i]->name[1] == 0x308)0043 0043 0219
{
int32_t putBreakpointhere = 0 ;
}
foundExp = FALSE ;
sequenceCompleted = FALSE ;
strength = toSort [ i ] - > strength ;
// an entry identical to a preceding expansion inherits that expansion's data
if ( strength = = UCOL_IDENTICAL & & toSort [ i - 1 ] - > isExpansion = = TRUE ) {
u_strcpy ( toSort [ i ] - > expansionString , toSort [ i - 1 ] - > expansionString ) ;
toSort [ i ] - > expLen = toSort [ i - 1 ] - > expLen ;
toSort [ i ] - > isExpansion = TRUE ;
toSort [ i ] - > expIndex = toSort [ i - 1 ] - > expIndex ;
toSort [ i ] - > expStrength = UCOL_IDENTICAL ;
//toSort[i]->expStrength = toSort[i-1]->expStrength;
foundExp = TRUE ;
sequenceCompleted = TRUE ;
}
//logger->log("%i %i\n", i, j);
// look for a reset candidate, retrying with each weaker strength up to maxStrength
while ( ! foundExp & & strength < = maxStrength ) {
j = i - 1 ;
// skip back over entries already marked as expansions or removed
while ( j & & ( toSort [ j ] - > isExpansion = = TRUE | | toSort [ j ] - > isRemoved = = TRUE ) ) {
//if(toSort[j]->strength < strength) {
//strength = toSort[j]->strength;
//}
j - - ;
}
//while(j && toSort[j]->strength > strength)
// keep going back while the candidate's own strength is greater than the
// measured strength of the difference between it and the current entry
while ( j & & toSort [ j ] - > strength > probe . getStrength ( * toSort [ j ] , * toSort [ i ] ) )
{
j - - ;
}
//if(toSort[j]->strength == strength) {
previous = toSort [ j ] ;
// give up on this entry if the candidate is identical-or-weaker from empty, or
// in the secondary case when its lastCC exceeds the firstCC of the upper bound
if ( previous - > strengthFromEmpty > = UCOL_IDENTICAL | |
( previous - > strengthFromEmpty = = UCOL_SECONDARY
& & strength = = UCOL_SECONDARY
& & previous - > lastCC > UB [ strength ] - > firstCC ) ) {
break ;
//continue;
}
//trial.setToConcat(previous, UB[strength]);
// candidate followed by the upper bound for the measured strength
trial . setToConcat ( previous , UB [ probe . getStrength ( * toSort [ j ] , * toSort [ i ] ) ] ) ;
if ( probe . compare ( trial , * toSort [ i ] ) > 0 ) {
foundExp = TRUE ;
}
//}
// advance to the next strength; quaternary jumps straight to identical
if ( strength = = UCOL_QUATERNARY ) {
strength = UCOL_IDENTICAL ;
} else {
strength = ( UColAttributeValue ) ( strength + 1 ) ;
}
}
// calculate the expanding sequence
if ( foundExp & & ! sequenceCompleted ) {
expIndexSize = 0 ;
expansionLine . clear ( ) ;
exCount + + ;
// we will start from strength between the expansion
// and the target (toSort[i] and toSort[j]. First we
// will add as many primaries as possible. Then we will
// try to add secondary pieces and then tertiary.
// found an expansion - what is the expanding sequence?
expStrength = UCOL_PRIMARY ;
// each pass of this loop tries to append one more primary to expansionLine
while ( ! sequenceCompleted ) {
k = 0 ;
prevK = 0 ;
while ( k < size ) {
// stop growing once the expanding sequence is longer than 15 code units
if ( expansionLine . len > 15 ) {
sequenceCompleted = TRUE ;
break ;
}
// advance to the next entry whose strength is primary
while ( k < size & & toSort [ k ] - > strength ! = UCOL_PRIMARY ) {
k + + ;
}
// nothing found
if ( k = = size ) {
break ;
}
// we need to skip over reordering things. If they were worthy, they would
// have been detected in the previous iteration.
//if(expansionLine.lastCC && toSort[k]->firstCC && expansionLine.lastCC > toSort[k]->firstCC) {
//k++;
//continue;
//}
// trial = reset candidate + sequence so far + primary k
trial = * previous ;
trial . append ( expansionLine ) ;
trial . append ( * toSort [ k ] ) ;
if ( toSort [ k ] - > name [ 0 ] = = 0x0067 ) {
int32_t putBreakPointHere = 0 ;
}
// primary k overshoots the expansion, so the previous primary (prevK) is the piece to add
if ( probe . compare ( trial , * toSort [ i ] ) > 0 ) {
if ( prevK ) {
// we got into situation where we have ch > ch+dot-below
// however, ch is a contraction and therefore we cannot use
// it properly. If we have hit on a contraction, we'll just try
// to continue. Probably need more logic here.
if ( contractionsTable - > get ( UnicodeString ( trial . name , trial . len ) ) = = NULL ) {
expansionLine . append ( * toSort [ prevK ] ) ;
expIndexes [ expIndexSize + + ] = prevK ;
break ;
} else {
int32_t putBreakPointHere = 0 ;
}
} else {
// the very first primary already overshoots: nothing more can be added
sequenceCompleted = TRUE ;
break ;
}
//break;
}
prevK = k ;
k + + ;
}
// no primary was usable, or the scan ran off the end of the array
if ( ! prevK | | k = = size ) {
break ;
}
}
// after this we have primaries lined up.
// we are going to goose up with secondaries and
// tertiaries
trial . setToConcat ( toSort [ j ] , & expansionLine ) ;
expStrength = probe . getStrength ( trial , * toSort [ i ] ) ;
if ( expStrength > UCOL_PRIMARY ) {
if ( expStrength = = UCOL_SECONDARY | | expStrength = = UCOL_OFF ) {
j = gooseUp ( j , i , expansionLine , expIndexes , expIndexSize , UCOL_SECONDARY ) ;
trial . setToConcat ( toSort [ j ] , & expansionLine ) ;
expStrength = probe . getStrength ( trial , * toSort [ i ] ) ;
if ( expStrength = = UCOL_TERTIARY ) {
j = gooseUp ( j , i , expansionLine , expIndexes , expIndexSize , UCOL_TERTIARY ) ;
}
} else if ( expStrength = = UCOL_TERTIARY ) {
j = gooseUp ( j , i , expansionLine , expIndexes , expIndexSize , UCOL_TERTIARY ) ;
}
}
// re-measure the remaining difference after the refinement
trial . setToConcat ( toSort [ j ] , & expansionLine ) ;
expStrength = probe . getStrength ( trial , * toSort [ i ] ) ;
if ( expansionLine . len ) {
if ( expansionLine . name [ 0 ] = = 0x73 & & expansionLine . name [ 1 ] = = 0x7a ) {
int32_t putBreakpointhere = 0 ;
}
UBool isExpansionLineAContraction = ( contractionsTable - > get ( UnicodeString ( expansionLine . name , expansionLine . len ) ) ! = NULL ) ;
// we have an expansion line and an expansion. There could be some expansions where
// the difference between expansion line and the end of expansion sequence is less or
// equal than the expansion strength. These should probably be removed.
int32_t diffLen = toSort [ i ] - > len - expansionLine . len ;
// compare the tail of the entry (same length as the expanding sequence) with the sequence
if ( diffLen > 0 ) {
trial . setTo ( UnicodeString ( toSort [ i ] - > name + diffLen , toSort [ i ] - > len - diffLen ) ) ;
} else {
trial = * toSort [ i ] ;
}
UColAttributeValue s1 = probe . getStrength ( trial , expansionLine ) ;
// UCOL_OFF in one direction: measure in the opposite direction instead
if ( s1 = = UCOL_OFF ) {
s1 = probe . getStrength ( expansionLine , trial ) ;
}
if ( ( ! isExpansionLineAContraction & & s1 > = expStrength ) | | ( diffLen < = 0 & & s1 = = UCOL_IDENTICAL ) ) {
// drop the entry: remove it from the contractions table and unlink it
contractionsTable - > remove ( UnicodeString ( toSort [ i ] - > name , toSort [ i ] - > len ) ) ;
toSort [ i ] - > isRemoved = TRUE ;
// NOTE(review): unlinking only happens when both neighbours exist
if ( toSort [ i ] - > next & & toSort [ i ] - > previous ) {
toSort [ i ] - > previous - > next = toSort [ i ] - > next ;
}
if ( toSort [ i ] - > previous & & toSort [ i ] - > next ) {
toSort [ i ] - > next - > previous = toSort [ i ] - > previous ;
}
debug - > log ( " Exp -N: " ) ;
debug - > log ( toSort [ i ] - > toString ( FALSE ) ) ;
debug - > log ( " / " ) ;
debug - > log ( expansionLine . toString ( FALSE ) , TRUE ) ;
}
else
{
// record the expansion on the entry
// NOTE(review): u_strncat appends to whatever expansionString already holds and
// the terminator is then written at index expLen - confirm the string starts empty.
u_strncat ( toSort [ i ] - > expansionString , expansionLine . name , expansionLine . len ) ;
toSort [ i ] - > isExpansion = TRUE ;
toSort [ i ] - > expStrength = expStrength ;
toSort [ i ] - > expLen = expansionLine . len ;
toSort [ i ] - > expansionString [ toSort [ i ] - > expLen ] = 0 ;
toSort [ i ] - > expIndex = j ;
}
}
}
// debug trace of the detected expansion together with the sort keys involved
if ( toSort [ i ] - > isExpansion = = TRUE ) {
if ( debug - > isOn ( ) ) {
debug - > log ( " Exp + : & " ) ;
debug - > log ( toSort [ j ] - > toString ( FALSE ) ) ;
debug - > log ( toSort [ i ] - > strengthToString ( toSort [ i ] - > expStrength , TRUE ) ) ;
debug - > log ( toSort [ i ] - > toString ( FALSE ) ) ;
debug - > log ( " " ) ;
if ( ! toSort [ j ] - > sortKey ) {
calculateSortKey ( * toSort [ j ] ) ;
}
debug - > log ( toSort [ j ] - > dumpSortkey ( ) ) ;
debug - > log ( " ... " ) ;
if ( ! toSort [ i ] - > sortKey ) {
calculateSortKey ( * toSort [ i ] ) ;
}
debug - > log ( toSort [ i ] - > dumpSortkey ( ) ) ;
calculateSortKey ( expansionLine ) ;
debug - > log ( " / " ) ;
debug - > log ( expansionLine . dumpSortkey ( ) , TRUE ) ;
}
}
}
// after detecting expansions, we want to position them.
// it is better to position expansions after all have been detected,
// since otherwise we will change the ordering.
for ( i = size - 1 ; i > = 0 ; i - - ) {
if ( toSort [ i ] - > isExpansion ) {
if ( toSort [ i ] - > name [ 0 ] = = 0x2A3 ) {
int32_t putBreakPointHere = 0 ;
}
// unlink the expansion from its current position in the list
if ( i ) {
if ( toSort [ i ] - > previous ) {
toSort [ i ] - > previous - > next = toSort [ i ] - > next ;
}
}
if ( i < size - 1 ) {
if ( toSort [ i ] - > next ) {
toSort [ i ] - > next - > previous = toSort [ i ] - > previous ;
}
}
// re-insert it directly after its reset entry
j = toSort [ i ] - > expIndex ;
toSort [ i ] - > next = toSort [ j ] - > next ;
toSort [ i ] - > previous = toSort [ j ] ;
toSort [ j ] - > next = toSort [ i ] ;
if ( toSort [ i ] - > next ) {
toSort [ i ] - > next - > previous = toSort [ i ] ;
}
// from now on the entry carries the strength relative to the reset entry
toSort [ i ] - > strength = toSort [ i ] - > expStrength ;
}
}
return exCount ;
}
// Moves the iteration cursor (`current`) to the head of the linked list and
// returns it; the result is whatever `first` holds.
Line *
SortedLines::getFirst() {
  return current = first;
}
// Moves the iteration cursor (`current`) to the tail of the linked list and
// returns it; the result is whatever `last` holds.
Line *
SortedLines::getLast() {
  return current = last;
}
void
SortedLines : : add ( Line * line , UBool linkIn ) {
if ( size + + = = capacity ) {
// grow
}
lines [ size ] = * line ;
Line * toAdd = & lines [ size ] ;
if ( linkIn & & first ) {
Line * current = first ;
while ( current ! = NULL & & probe . comparer ( & current , & toAdd ) < 0 ) {
current = current - > next ;
}
if ( current = = NULL ) {
toAdd - > previous = last ;
toAdd - > next = NULL ;
if ( last ! = NULL ) {
last - > next = toAdd ;
}
last = toAdd ;
if ( first = = NULL ) {
first = toAdd ;
}
} else { // current != NULL
toAdd - > next = current ;
toAdd - > previous = current - > previous ;
if ( current - > previous ) {
current - > previous - > next = toAdd ;
} else {
first = toAdd ;
}
current - > previous = toAdd ;
}
}
}
// Advances the iteration cursor one step along the `next` links.
// Returns the new cursor, or NULL when the cursor was already NULL or the
// end of the list has been reached.
Line *
SortedLines::getNext()
{
  if (current == NULL) {
    return current;
  }
  current = current->next;
  return current;
}
// Moves the iteration cursor one step back along the `previous` links.
// Returns the new cursor, or NULL when the cursor was already NULL or the
// start of the list has been reached.
Line *
SortedLines::getPrevious()
{
  if (current == NULL) {
    return current;
  }
  current = current->previous;
  return current;
}
// Returns the element reached by following `next` links `index` times from
// `first`. Yields NULL when the list is shorter than that; an index of zero
// or less yields `first` itself.
Line *
SortedLines::operator[](int32_t index)
{
  Line *node = first;
  int32_t stepsLeft = index;
  while (stepsLeft > 0 && node != NULL) {
    node = node->next;
    --stepsLeft;
  }
  return node;
}
// Renders a sequence of lines as rule text.
//
// sortedLines    array of line pointers; element 0 is always the first line printed
// linesSize      number of elements in sortedLines
// pretty         passed on to the Line formatting calls; also adds a name
//                comment after resets and a line break after each entry
// useLinks       TRUE: follow the `next` links starting at sortedLines[0];
//                FALSE: walk the array by index
// printSortKeys  append each line's sort key dump (ignored while no sort keys
//                have been calculated)
//
// Reset lines are written as " & " followed by the name. Removed lines are
// skipped. Every line except the first is preceded by its strength operator.
// An empty or NULL array now yields an empty string instead of dereferencing
// sortedLines[0].
UnicodeString
SortedLines::arrayToString(Line** sortedLines, int32_t linesSize, UBool pretty, UBool useLinks, UBool printSortKeys) {
  UnicodeString result;
  if (sortedLines == NULL || linesSize <= 0 || sortedLines[0] == NULL) {
    return result;
  }
  if (printSortKeys && !sortkeys) {
    printSortKeys = FALSE;
  }

  // The first line carries no strength operator and no sort key dump.
  Line *previous = sortedLines[0];
  if (previous->isReset) {
    result.append(" & ");
    result.append(previous->name, previous->len);
    if (pretty) {
      result.append(" # ");
      result.append(previous->stringToName(previous->name, previous->len));
      result.append(" \n ");
    }
  } else if (!previous->isRemoved) {
    result.append(previous->toString(pretty));
    if (pretty) {
      result.append(" \n ");
    }
  }

  int32_t i = 1;
  while ((i < linesSize && !useLinks) || (previous->next && useLinks)) {
    Line *line = useLinks ? previous->next : sortedLines[i];
    if (line->isReset) {
      result.append(" & ");
      result.append(line->name, line->len);
      if (pretty) {
        result.append(" # ");
        result.append(line->stringToName(line->name, line->len));
        result.append(" \n ");
      }
    } else if (!line->isRemoved) {
      result.append(line->strengthToString(line->strength, pretty));
      result.append(line->toString(pretty));
      if (printSortKeys) {
        result.append(line->dumpSortkey());
      }
      if (pretty) {
        result.append(" \n ");
      }
    }
    previous = line;
    i++;
  }
  return result;
}
// Rebuilds a SortedLines object from a dump in the format emitted by toFile():
// a header line holding size, frenchSecondary and upperFirst, followed by one
// serialized Line per text line.
//
// file    open stream positioned at the start of the dump
// logger  progress output
// debug   debug output
// status  receives errors reported by Line::initFromString; set to
//         U_INVALID_FORMAT_ERROR when the header cannot be parsed
//
// Hardening relative to the previous implementation:
//  - the header parse result is checked, so the flags are never read
//    uninitialised;
//  - at most `capacity` lines are read, so a dump with more lines than its
//    header announces can no longer overrun `lines`;
//  - an empty dump leaves first/last NULL instead of pointing at lines[-1];
//  - the iteration cursor starts out as NULL.
SortedLines::SortedLines(FILE *file, UPrinter *logger, UPrinter *debug, UErrorCode &status) :
toSort(NULL),
toSortCapacity(0),
lines(NULL),
size(0),
capacity(0),
first(NULL),
last(NULL),
logger(logger),
debug(debug),
contractionsTable(NULL),
duplicators(NULL),
maxExpansionPrefixSize(0),
wordSort(FALSE),
frenchSecondary(FALSE),
upperFirst(FALSE),
sortkeys(NULL),
sortkeyOffset(0)
{
  debug->log(" *** loading a dump \n ");
  current = NULL;
  memset(UB, 0, sizeof(UB));
  int32_t i = 0;
  for (i = 0; i < UCOL_OFF; i++) {
    UB[i] = &empty;
  }

  int32_t newFrench = 0, newUpperFirst = 0;
  if (fscanf(file, " %i,%i,%i \n ", &size, &newFrench, &newUpperFirst) != 3 || size < 0) {
    // Unusable header: leave the object empty and report the problem.
    size = 0;
    status = U_INVALID_FORMAT_ERROR;
    return;
  }
  debug->log(" Read size %i, frenchSecondary %i and upperFirst %i \n ", size, newFrench, newUpperFirst);
  frenchSecondary = static_cast<UBool>(newFrench);
  upperFirst = static_cast<UBool>(newUpperFirst);

  capacity = size;
  lines = new Line[capacity];

  i = 0;
  char buff[256];
  while (i < capacity && fgets(buff, 256, file)) {
    if (i % 20 == 0) {
      logger->log(" \r Line: %04i ", i, buff);
    }
    lines[i].initFromString(buff, 256, status);
    if (i) {
      // chain the lines in file order
      lines[i].previous = &lines[i-1];
      lines[i-1].next = &lines[i];
    }
    i++;
  }
  // The number of lines actually read wins over the header value.
  size = i;
  toSort = new Line*[size];
  setSortingArray(toSort, lines, size);
  if (size > 0) {
    first = &lines[0];
    last = &lines[size-1];
  }
}
void
SortedLines : : toFile ( FILE * file , UBool useLinks , UErrorCode & status )
{
fprintf ( file , " %i,%i,%i \n " , size , frenchSecondary , upperFirst ) ;
int32_t i = 1 ;
Line * previous = toSort [ 0 ] ;
Line * line = NULL ;
char buff [ 256 ] ;
previous - > write ( buff , 256 , status ) ;
fprintf ( file , " %s \n " , buff ) ;
fflush ( file ) ;
while ( previous - > next ) {
if ( useLinks ) {
line = previous - > next ;
} else {
line = toSort [ i ] ;
}
line - > write ( buff , 256 , status ) ;
fprintf ( file , " %s \n " , buff ) ;
i + + ;
previous = line ;
}
}
// Renders toSort in array order, using each line's strengthFromEmpty as the
// operator in front of it (the first line carries no operator). Reset lines
// are written as " & " followed by the name; removed lines are skipped.
//
// The loop condition used to be `i < size || previous->next` while always
// indexing the array, so a last element with a non-NULL `next` link caused
// reads of toSort[size] and beyond. The walk is now bounded by `size` in
// array mode. An empty container yields an empty string.
UnicodeString
SortedLines::toStringFromEmpty() {
  // Both switches are fixed; they are kept as named constants so the
  // structure stays parallel to arrayToString().
  const UBool useLinks = FALSE;
  const UBool pretty = FALSE;
  UnicodeString result;
  if (toSort == NULL || size <= 0) {
    return result;
  }

  Line *previous = toSort[0];
  if (previous->isReset) {
    result.append(" & ");
    if (pretty) {
      result.append(" \n ");
    }
    result.append(previous->name, previous->len);
  } else if (!previous->isRemoved) {
    result.append(previous->toString(pretty));
    if (pretty) {
      result.append(" \n ");
    }
  }

  int32_t i = 1;
  while ((!useLinks && i < size) || (useLinks && previous->next)) {
    Line *line = useLinks ? previous->next : toSort[i];
    if (line->isReset) {
      result.append(" & ");
      result.append(line->name, line->len);
      if (pretty) {
        result.append(" # ");
        result.append(line->stringToName(line->name, line->len));
        result.append(" \n ");
      }
    } else if (!line->isRemoved) {
      result.append(line->strengthToString(line->strengthFromEmpty, pretty));
      result.append(line->toString(pretty));
      if (pretty) {
        result.append(" \n ");
      }
    }
    previous = line;
    i++;
  }
  return result;
}
// Compact rendering of toSort: arrayToString() with pretty printing and
// sort key dumping switched off.
// useLinks  forwarded to arrayToString()
UnicodeString
SortedLines::toString(UBool useLinks)
{
  const UBool pretty = FALSE;
  const UBool printSortKeys = FALSE;
  return arrayToString(toSort, size, pretty, useLinks, printSortKeys);
}
// Pretty rendering of toSort: arrayToString() with pretty printing switched on.
// useLinks       forwarded to arrayToString()
// printSortKeys  forwarded to arrayToString()
UnicodeString
SortedLines::toPrettyString(UBool useLinks, UBool printSortKeys)
{
  const UBool pretty = TRUE;
  return arrayToString(toSort, size, pretty, useLinks, printSortKeys);
}
// Dispatches to one of the output writers based on `format`: toHTML() and
// toXML() for their respective format names, toBundle() for anything else.
// All remaining arguments are forwarded unchanged.
UnicodeString
SortedLines::toOutput(const char *format,
                      const char *locale, const char *platform, const char *reference,
                      UBool useLinks, UBool initialize, UBool moreToCome) {
  const bool wantsHTML = (strcmp(format, " HTML ") == 0);
  if (wantsHTML) {
    return toHTML(locale, platform, reference, useLinks, initialize, moreToCome);
  }
  // Only evaluated when the HTML comparison failed, as before.
  const bool wantsXML = (strcmp(format, " XML ") == 0);
  if (wantsXML) {
    return toXML(locale, platform, reference, useLinks, initialize, moreToCome);
  }
  return toBundle(locale, platform, reference, useLinks, initialize, moreToCome);
}
// Renders the rules as an HTML-flavoured resource bundle.
//
// locale, platform, reference  names written into the header
// useLinks    TRUE: follow the `next` links from toSort[0];
//             FALSE: walk toSort[0..size-1]
// initialize  emit the document header and the opening of the bundle
// moreToCome  when FALSE, emit the closing braces and the closing html tag
//
// Lines that are resets, or that are not removed, are written through
// Line::toHTMLString().
//
// In array mode the loop used to load toSort[size] after the last element and
// then dereference it while evaluating the loop condition. The array is no
// longer read past its end, and an empty container produces no rule lines.
// NOTE(review): in link mode the loop stops as soon as line->next is NULL, so
// the last linked line is not written. That behaviour is unchanged here;
// confirm whether it is intended.
UnicodeString
SortedLines::toHTML(const char *locale,
                    const char *platform, const char *reference,
                    UBool useLinks, UBool initialize, UBool moreToCome)
{
  UnicodeString result;
  if (initialize) {
    result.append(" <html> \n <head> \n <meta http-equiv= \" content-type \" content= \" text/html; charset=utf-8 \" > \n </head> \n ");
    result.append(" # Collation data resource bundle generated for locale: ");
    result.append(locale);
    result.append(" <br> \n # For platform ");
    result.append(platform);
    result.append(" reference platform ");
    result.append(reference);
    result.append(" <br><br> \n \n \n ");
    result.append(locale);
    if (platform) {
      result.append(" _ ");
      result.append(platform);
    }
    if (reference) {
      result.append(" _vs_ ");
      result.append(reference);
    }
    result.append(" {<br> \n ");
    result.append(" collations {<br> \n standard {<br> \n Sequence {<br> \n ");
  }
  if (frenchSecondary) {
    result.append(" [backwards 2]<br> \n ");
  }
  if (upperFirst) {
    result.append(" [casefirst upper]<br> \n ");
  }

  Line *line = (toSort != NULL && size > 0) ? toSort[0] : NULL;
  int32_t i = 0;
  while (line != NULL && ((!useLinks && i < size) || (useLinks && line->next))) {
    if (line->isReset || !line->isRemoved) {
      result.append(line->toHTMLString());
    }
    i++;
    if (useLinks) {
      line = line->next;
    } else {
      line = (i < size) ? toSort[i] : NULL;
    }
  }

  if (!moreToCome) {
    result.append(" }<br> \n }<br> \n }<br> \n }<br> \n ");
    result.append(" </html> \n ");
  }
  return result;
}
// Renders the rules for the XML output format.
// NOTE(review): the text produced here is the same HTML-flavoured output that
// toHTML() produces (including Line::toHTMLString()); no XML-specific
// formatting exists yet.
//
// locale, platform, reference  names written into the header
// useLinks    TRUE: follow the `next` links from toSort[0];
//             FALSE: walk toSort[0..size-1]
// initialize  emit the document header and the opening of the bundle
// moreToCome  when FALSE, emit the closing braces and the closing html tag
//
// In array mode the loop used to load toSort[size] after the last element and
// then dereference it while evaluating the loop condition. The array is no
// longer read past its end, and an empty container produces no rule lines.
// NOTE(review): in link mode the last linked line (the one whose `next` is
// NULL) is not written. That behaviour is unchanged here; confirm whether it
// is intended.
UnicodeString
SortedLines::toXML(const char *locale,
                   const char *platform, const char *reference,
                   UBool useLinks, UBool initialize, UBool moreToCome)
{
  UnicodeString result;
  if (initialize) {
    result.append(" <html> \n <head> \n <meta http-equiv= \" content-type \" content= \" text/html; charset=utf-8 \" > \n </head> \n ");
    result.append(" # Collation data resource bundle generated for locale: ");
    result.append(locale);
    result.append(" <br> \n # For platform ");
    result.append(platform);
    result.append(" reference platform ");
    result.append(reference);
    result.append(" <br><br> \n \n \n ");
    result.append(locale);
    if (platform) {
      result.append(" _ ");
      result.append(platform);
    }
    if (reference) {
      result.append(" _vs_ ");
      result.append(reference);
    }
    result.append(" {<br> \n ");
    result.append(" collations {<br> \n standard {<br> \n Sequence {<br> \n ");
  }
  if (frenchSecondary) {
    result.append(" [backwards 2]<br> \n ");
  }
  if (upperFirst) {
    result.append(" [casefirst upper]<br> \n ");
  }

  Line *line = (toSort != NULL && size > 0) ? toSort[0] : NULL;
  int32_t i = 0;
  while (line != NULL && ((!useLinks && i < size) || (useLinks && line->next))) {
    if (line->isReset || !line->isRemoved) {
      result.append(line->toHTMLString());
    }
    i++;
    if (useLinks) {
      line = line->next;
    } else {
      line = (i < size) ? toSort[i] : NULL;
    }
  }

  if (!moreToCome) {
    result.append(" }<br> \n }<br> \n }<br> \n }<br> \n ");
    result.append(" </html> \n ");
  }
  return result;
}
// Renders the rules as a plain-text resource bundle.
//
// locale, platform, reference  names written into the header comment
// useLinks    TRUE: follow the `next` links from toSort[0];
//             FALSE: walk toSort[0..size-1]
// initialize  emit the header comment and the opening of the bundle
// moreToCome  when FALSE, emit the closing braces
//
// Lines that are resets, or that are not removed, are written through
// Line::toBundleString().
//
// In array mode the loop used to load toSort[size] after the last element and
// then dereference it while evaluating the loop condition. The array is no
// longer read past its end, and an empty container produces no rule lines.
// NOTE(review): in link mode the last linked line (the one whose `next` is
// NULL) is not written. That behaviour is unchanged here; confirm whether it
// is intended.
UnicodeString
SortedLines::toBundle(const char *locale,
                      const char *platform, const char *reference,
                      UBool useLinks, UBool initialize, UBool moreToCome)
{
  UnicodeString result;
  if (initialize) {
    result.append(" // Collation data resource bundle generated for locale: ");
    result.append(locale);
    result.append(" \n // For platform ");
    result.append(platform);
    result.append(" reference platform ");
    result.append(reference);
    result.append(" \n \n \n ");
    // The bundle name is the bare locale; unlike the HTML output, the
    // platform and reference suffixes are deliberately not appended here.
    result.append(locale);
    result.append(" { \n ");
    result.append(" collations { \n standard { \n Sequence { \n ");
  }
  if (frenchSecondary) {
    result.append(" [backwards 2] \n ");
  }
  if (upperFirst) {
    result.append(" [casefirst upper] \n ");
  }

  Line *line = (toSort != NULL && size > 0) ? toSort[0] : NULL;
  int32_t i = 0;
  while (line != NULL && ((!useLinks && i < size) || (useLinks && line->next))) {
    if (line->isReset || !line->isRemoved) {
      result.append(line->toBundleString());
    }
    i++;
    if (useLinks) {
      line = line->next;
    } else {
      line = (i < size) ? toSort[i] : NULL;
    }
  }

  if (!moreToCome) {
    result.append(" } \n } \n } \n } \n ");
  }
  return result;
}
int32_t
SortedLines : : getSize ( ) const {
return repertoire . size ( ) ;
}
// Compares this (the tailoring) against a reference ordering and marks what
// actually differs. Both objects are walked in parallel through their linked
// lists. Runs of equal lines are flagged isRemoved; where the two orderings
// diverge, the code searches forward (up to lookForward steps in the
// tailoring) for the point where they agree again. The last common line before
// a differing stretch is then turned into a reset (isReset), unless every line
// of the stretch turned out to have the same cumulative strength in both.
//
// reference  ordering to compare against; its cursor and the
//            cumulativeStrength fields of its lines are modified
//
// NOTE(review): reference.getFirst() and getFirst() are dereferenced without a
// NULL check, so both lists are assumed to be non-empty.
void
SortedLines : : reduceDifference ( SortedLines & reference ) {
UErrorCode status = U_ZERO_ERROR ;
// for upper-first orderings the case is swapped before comparing
if ( upperFirst ) {
swapCase ( ) ;
}
// both sorted lines structures need to have established links and strengths
// We walk down both structures and note differences. These
// differences will modify this by removing elements, setting resets
// etc...
// we will prefer insertions from tailoring to reference, then deletions
// there are two tables that keep seen elements.
Hashtable * seenThis = new Hashtable ( ) ;
Hashtable * seenReference = new Hashtable ( ) ;
UBool found = FALSE ;
UBool finished = FALSE ;
// how many unseen tailoring lines may be skipped while looking for a match
const int32_t lookForward = 20 ;
int32_t tailoringMove = 0 ;
//int32_t referenceSize = reference.getSize();
// the first line of each list is taken as the initial common point
Line * refLine = reference . getFirst ( ) ;
Line * refLatestEqual = refLine ;
refLine = refLine - > next ;
Line * myLine = getFirst ( ) ;
Line * myLatestEqual = myLine ;
myLatestEqual - > isRemoved = TRUE ;
myLine = myLine - > next ;
while ( myLine & & refLine ) {
found = FALSE ;
// consume the run of lines that are equal in both lists
while ( myLine & & refLine & & myLine - > equals ( * refLine ) ) {
myLatestEqual = myLine ;
myLatestEqual - > isRemoved = TRUE ;
myLine = myLine - > next ;
refLatestEqual = refLine ;
refLine = refLine - > next ;
if ( refLine = = NULL & & myLine = = NULL ) {
finished = TRUE ;
}
}
// restart cumulative strength tracking at the first differing lines
if ( myLine ) {
myLine - > cumulativeStrength = myLine - > strength ;
}
if ( refLine ) {
refLine - > cumulativeStrength = refLine - > strength ;
}
// here is the difference
while ( ! found & & ! finished ) {
tailoringMove = 0 ;
if ( myLine & & refLine ) {
// first bring both sides to the same cumulative strength
if ( myLine - > cumulativeStrength > refLine - > cumulativeStrength ) {
// tailoring z <<< x, UCA z < y
while ( myLine - > cumulativeStrength > refLine - > cumulativeStrength ) {
myLine = myLine - > next ;
if ( myLine ) {
transferCumulativeStrength ( myLine - > previous , myLine ) ;
} else {
break ;
}
}
} else if ( myLine - > cumulativeStrength < refLine - > cumulativeStrength ) {
// tailoring z < x, UCA z <<< y
while ( myLine - > cumulativeStrength < refLine - > cumulativeStrength ) {
// remember the reference lines that are skipped over
seenReference - > put ( UnicodeString ( refLine - > name , refLine - > len ) , refLine , status ) ;
refLine = refLine - > next ;
if ( refLine ) {
transferCumulativeStrength ( refLine - > previous , refLine ) ;
} else {
break ;
}
}
}
// this is the interesting point. Now we search for character match
while ( myLine & & refLine & & ( ! myLine - > equals ( * refLine ) | | myLine - > strength = = UCOL_IDENTICAL )
& & tailoringMove < lookForward ) {
if ( seenThis - > get ( UnicodeString ( refLine - > name , refLine - > len ) ) ) {
// we are not interested in stuff from the reference that is already accounted
// for in the tailoring.
refLine = refLine - > next ;
if ( refLine ) {
transferCumulativeStrength ( refLine - > previous , refLine ) ;
}
} else {
myLine = myLine - > next ;
if ( myLine ) {
transferCumulativeStrength ( myLine - > previous , myLine ) ;
// only tailoring lines not already seen in the reference count against the limit
if ( ! seenReference - > get ( UnicodeString ( myLine - > name , myLine - > len ) ) ) {
tailoringMove + + ;
}
}
}
}
}
if ( refLine = = NULL ) { // ran out of reference
// this is the tail of tailoring - the last insertion
myLine = NULL ;
found = TRUE ;
} else if ( tailoringMove = = lookForward | | myLine = = NULL ) { // ran over threshold or out of tailoring
tailoringMove = 0 ;
// we didn't find insertion after all
// we will try substitution next
// reset the tailoring pointer
myLine = myLatestEqual - > next ;
// move the reference
refLine = refLine - > next ;
if ( refLine ) {
transferCumulativeStrength ( refLine - > previous , refLine ) ;
}
} else { // we found an insertion
tailoringMove = 0 ;
if ( myLine - > strength ! = refLine - > strength ) {
// same characters but different strengths: skip ahead while that stays true
while ( myLine & & refLine & & * myLine = = * refLine
& & ( myLine - > strength ! = refLine - > strength
| | myLine - > strength = = UCOL_IDENTICAL ) ) {
myLine = myLine - > next ;
refLine = refLine - > next ;
}
// NOTE(review): the loop above can end with myLine or refLine being NULL,
// in which case this comparison dereferences a NULL pointer.
if ( * myLine ! = * refLine ) {
continue ;
}
}
if ( myLine & & refLine & & myLine - > previous - > strength < myLine - > strength ) {
myLine = myLine - > next ;
refLine = refLine - > next ;
// NOTE(review): myLine / refLine are not checked for NULL after advancing.
if ( * myLine ! = * refLine ) {
continue ;
}
}
found = TRUE ;
}
if ( found ) {
// something lies between the last common line and the re-synchronisation
// point, or the reference is exhausted
if ( myLatestEqual - > next ! = myLine | | refLine = = NULL ) {
Line * myStart = NULL ;
// this is a reset and a sequence
// myLatestEqual points at the last point that was the same
// This point will be a reset
if ( myLine & & refLine ) { // if there is anything more to do - it might be worth saving it
myStart = myLatestEqual ;
while ( myStart ! = myLine ) {
seenThis - > put ( UnicodeString ( myStart - > name , myStart - > len ) , myStart , status ) ;
myStart = myStart - > next ;
}
}
// Try to weed out stuff that is not affected, like:
// Tailoring:
// <<<S<<\u017F<\u0161<<<\u0160<t
// UCA:
// <<<S<<\u0161<<<\u0160<<\u017F<t
// Result:
// &S<<\u017F<\u0161<<<\u0160
// we have a sequence that spans from myLatestEqual to myLine (that one could be NULL,
// so we have to go down from myLatestEqual.
// Basically, for every element, we want to see the strongest cumulative difference
// from the reset point. If the cumulative difference is the same in both the reference and
// tailoring, that element could be removed.
calculateCumulativeStrengths ( myLatestEqual , myLine ) ;
calculateCumulativeStrengths ( refLatestEqual , refLine ) ;
myStart = myLatestEqual ;
int32_t removed = 0 ;
int32_t traversed = 0 ;
// compare every tailoring line of the stretch with every reference line of the stretch
while ( myStart & & myStart ! = myLine ) {
Line * refStart = refLatestEqual ;
while ( refStart & & refStart ! = refLine ) {
if ( * myStart = = * refStart ) {
if ( myStart - > cumulativeStrength = = refStart - > cumulativeStrength ) {
myStart - > isRemoved = TRUE ;
removed + + ;
}
}
refStart = refStart - > next ;
}
myStart = myStart - > next ;
traversed + + ;
}
// at least one line of the stretch survives: the common point becomes a reset
if ( removed < traversed ) {
myLatestEqual - > isReset = TRUE ;
myLatestEqual - > isRemoved = FALSE ;
}
myLatestEqual = myLine ;
}
}
}
}
// NOTE(review): the case is not swapped back (the call is commented out), so
// the lines stay case-swapped after this function returns.
if ( upperFirst ) {
//swapCase();
}
delete seenThis ;
delete seenReference ;
}
void
SortedLines : : transferCumulativeStrength ( Line * previous , Line * that ) {
if ( that - > strength > previous - > cumulativeStrength ) {
that - > cumulativeStrength = previous - > cumulativeStrength ;
} else {
that - > cumulativeStrength = that - > strength ;
}
}
void
SortedLines : : calculateCumulativeStrengths ( Line * start , Line * end ) {
// start is a reset - end may be NULL
start = start - > next ;
UColAttributeValue cumulativeStrength = UCOL_OFF ;
while ( start & & start ! = end ) {
if ( start - > strength < cumulativeStrength ) {
cumulativeStrength = start - > strength ;
}
start - > cumulativeStrength = cumulativeStrength ;
start = start - > next ;
}
}
void
SortedLines : : getRepertoire ( UnicodeSet & fillIn ) {
fillIn . clear ( ) ;
fillIn . addAll ( repertoire ) ;
}
void
SortedLines : : removeDecompositionsFromRepertoire ( ) {
UnicodeSetIterator repertoireIter ( repertoire ) ;
UErrorCode status = U_ZERO_ERROR ;
UChar string [ 256 ] ;
UChar composed [ 256 ] ;
int32_t len = 0 , compLen = 0 ;
UnicodeString compString ;
UnicodeSet toRemove ;
while ( repertoireIter . next ( ) ) {
len = 0 ;
if ( repertoireIter . isString ( ) ) { // process a string
len = repertoireIter . getString ( ) . length ( ) ;
u_memcpy ( string , repertoireIter . getString ( ) . getBuffer ( ) , len ) ;
} else { // process code point
UBool isError = FALSE ;
U16_APPEND ( string , len , 25 , repertoireIter . getCodepoint ( ) , isError ) ;
}
string [ len ] = 0 ; // zero terminate, for our evil ways
compLen = unorm_normalize ( string , len , UNORM_NFC , 0 , composed , 256 , & status ) ;
if ( compLen ! = len | | u_strcmp ( string , composed ) ! = 0 ) {
compString . setTo ( composed , compLen ) ;
if ( repertoire . contains ( compString ) ) {
toRemove . add ( UnicodeString ( string , len ) ) ;
}
}
}
debug - > log ( " \n Removing \n " ) ;
debug - > log ( toRemove . toPattern ( compString , TRUE ) , TRUE ) ;
repertoire . removeAll ( toRemove ) ;
}
void
SortedLines : : swapCase ( )
{
int32_t i = 0 ;
for ( i = 0 ; i < size ; i + + ) {
toSort [ i ] - > swapCase ( ) ;
}
}
void
SortedLines : : calculateSortKey ( Line & line )
{
if ( ! sortkeys ) {
sortkeys = new uint8_t [ size * 1024 ] ;
memset ( sortkeys , 0 , size * 1024 ) ;
}
line . sortKey = sortkeys + sortkeyOffset ;
sortkeyOffset + = probe . getSortKey ( line , sortkeys + sortkeyOffset , size * 256 - sortkeyOffset ) ;
}
void
SortedLines : : calculateSortKeys ( )
{
if ( sortkeys ) {
delete [ ] sortkeys ;
}
sortkeyOffset = 0 ;
sortkeys = new uint8_t [ size * 256 ] ;
memset ( sortkeys , 0 , size * 256 ) ;
int32_t i = 0 ;
for ( i = 0 ; i < size ; i + + ) {
calculateSortKey ( * toSort [ i ] ) ;
}
}