ICU-4165 API for finding contractions and expansions - C version

X-SVN-Rev: 17952
This commit is contained in:
Vladimir Weinstein 2005-06-20 22:58:50 +00:00
parent 91ce50bc16
commit 2e725da4c6
2 changed files with 122 additions and 42 deletions

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2004, International Business Machines
* Copyright (C) 2004-2005, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: ucol_sit.cpp
@ -461,7 +461,7 @@ ucol_openFromShortString( const char *definition,
uprv_memset(buffer, 0, internalBufferSize);
uloc_canonicalize(s.locale, buffer, internalBufferSize, status);
UCollator *result = ucol_open(s.locale, status);
UCollator *result = ucol_open(buffer, status);
int32_t i = 0;
for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
@ -882,50 +882,94 @@ ucol_getAttributeOrDefault(const UCollator *coll, UColAttribute attr, UErrorCode
struct contContext {
const UCollator *coll;
USet *conts;
USet *expansions;
USet *removedContractions;
UBool addPrefixes;
UErrorCode *status;
};
static void
addContraction(const UCollator *coll, USet *contractions, UChar *buffer, int32_t bufLen,
uint32_t CE, int32_t rightIndex, UErrorCode *status)
addSpecial(contContext *context, UChar *buffer, int32_t bufLen,
uint32_t CE, int32_t leftIndex, int32_t rightIndex, UErrorCode *status)
{
if(rightIndex == bufLen-1) {
*status = U_INTERNAL_PROGRAM_ERROR;
return;
}
const UCollator *coll = context->coll;
USet *contractions = context->conts;
USet *expansions = context->expansions;
UBool addPrefixes = context->addPrefixes;
const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
uint32_t newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
// we might have a contraction that ends from previous level
if(newCE != UCOL_NOT_FOUND && rightIndex > 1) {
uset_addString(contractions, buffer, rightIndex);
}
if(newCE != UCOL_NOT_FOUND) {
if(isSpecial(CE) && getCETag(CE) == CONTRACTION_TAG && isSpecial(newCE) && getCETag(newCE) == SPEC_PROC_TAG && addPrefixes) {
addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status);
}
if(rightIndex-leftIndex > 1) {
uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex);
if(expansions && isSpecial(CE) && getCETag(CE) == EXPANSION_TAG) {
uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex);
}
}
}
UCharOffset++;
while(*UCharOffset != 0xFFFF) {
newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
buffer[rightIndex] = *UCharOffset;
if(isSpecial(newCE) && getCETag(newCE) == CONTRACTION_TAG) {
addContraction(coll, contractions, buffer, bufLen, newCE, rightIndex + 1, status);
} else {
uset_addString(contractions, buffer, rightIndex + 1);
}
UCharOffset++;
// check whether we're doing contraction or prefix
if(getCETag(CE) == SPEC_PROC_TAG && addPrefixes) {
if(leftIndex == 0) {
*status = U_INTERNAL_PROGRAM_ERROR;
return;
}
--leftIndex;
while(*UCharOffset != 0xFFFF) {
newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
buffer[leftIndex] = *UCharOffset;
if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) {
addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status);
} else {
uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex);
if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) {
uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex);
}
}
UCharOffset++;
}
} else if(getCETag(CE) == CONTRACTION_TAG) {
if(rightIndex == bufLen-1) {
*status = U_INTERNAL_PROGRAM_ERROR;
return;
}
while(*UCharOffset != 0xFFFF) {
newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
buffer[rightIndex] = *UCharOffset;
if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) {
addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex+1, status);
} else {
uset_addString(contractions, buffer+leftIndex, rightIndex+1-leftIndex);
if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) {
uset_addString(expansions, buffer+leftIndex, rightIndex+1-leftIndex);
}
}
UCharOffset++;
}
}
}
U_CDECL_BEGIN
static UBool U_CALLCONV
_processContractions(const void *context, UChar32 start, UChar32 limit, uint32_t CE)
_processSpecials(const void *context, UChar32 start, UChar32 limit, uint32_t CE)
{
UErrorCode *status = ((contContext *)context)->status;
USet *unsafe = ((contContext *)context)->conts;
USet *contractions = ((contContext *)context)->conts;
USet *expansions = ((contContext *)context)->expansions;
USet *removed = ((contContext *)context)->removedContractions;
const UCollator *coll = ((contContext *)context)->coll;
UBool addPrefixes = ((contContext *)context)->addPrefixes;
UChar contraction[internalBufferSize];
if(isSpecial(CE) && getCETag(CE) == CONTRACTION_TAG) {
if(isSpecial(CE)) {
if(contractions && ((getCETag(CE) == SPEC_PROC_TAG && addPrefixes) || getCETag(CE) == CONTRACTION_TAG)) {
while(start < limit && U_SUCCESS(*status)) {
// if there are suppressed contractions, we don't
// want to add them.
@ -935,10 +979,15 @@ _processContractions(const void *context, UChar32 start, UChar32 limit, uint32_t
}
// we start our contraction from middle, since we don't know if it
// will grow toward right or left
contraction[0] = (UChar)start;
addContraction(coll, unsafe, contraction, internalBufferSize, CE, 1, status);
contraction[internalBufferSize/2] = (UChar)start;
addSpecial(((contContext *)context), contraction, internalBufferSize, CE, internalBufferSize/2, internalBufferSize/2+1, status);
start++;
}
} else if(expansions && getCETag(CE) == EXPANSION_TAG) {
while(start < limit && U_SUCCESS(*status)) {
uset_add(expansions, start++);
}
}
}
if(U_FAILURE(*status)) {
return FALSE;
@ -970,47 +1019,63 @@ U_CAPI int32_t U_EXPORT2
ucol_getContractions( const UCollator *coll,
USet *contractions,
UErrorCode *status)
{
int32_t noConts = 0;
ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status);
return uset_getItemCount(contractions);
}
/**
* Get a set containing the expansions defined by the collator. The set includes
* both the UCA expansions and the expansions defined by the tailoring
* @param coll collator
* @param conts the set to hold the result
* @param addPrefixes add the prefix contextual elements to contractions
* @param status to hold the error code
* @return the size of the contraction set
*
* @draft ICU 3.4
*/
U_CAPI void U_EXPORT2
ucol_getContractionsAndExpansions( const UCollator *coll,
USet *contractions,
USet *expansions,
UBool addPrefixes,
UErrorCode *status)
{
if(U_FAILURE(*status)) {
return 0;
return;
}
if(coll == NULL || contractions == NULL) {
if(coll == NULL) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
return;
}
uset_clear(contractions);
if(expansions) {
uset_clear(expansions);
}
int32_t rulesLen = 0;
const UChar* rules = ucol_getRules(coll, &rulesLen);
UColTokenParser src;
ucol_tok_initTokenList(&src, rules, rulesLen, coll->UCA, status);
contContext c = { NULL, contractions, src.removeSet, status };
contContext c = { NULL, contractions, expansions, src.removeSet, addPrefixes, status };
coll->mapping->getFoldingOffset = _getTrieFoldingOffset;
// TODO: if you're supressing contractions in the tailoring
// you want to remove (or rather not include) contractions
// from the UCA.
// Probably want to pass a set of contraction starters that
// are suppressed. However, we don't want a dependency on
// the builder, so this is going to be hard to pull off.
// Add the UCA contractions
c.coll = coll->UCA;
utrie_enum(coll->UCA->mapping, NULL, _processContractions, &c);
utrie_enum(coll->UCA->mapping, NULL, _processSpecials, &c);
// This is collator specific. Add contractions from a collator
c.coll = coll;
c.removedContractions = NULL;
utrie_enum(coll->mapping, NULL, _processContractions, &c);
utrie_enum(coll->mapping, NULL, _processSpecials, &c);
ucol_tok_closeTokenList(&src);
return uset_getItemCount(contractions);
}
U_CAPI int32_t U_EXPORT2
ucol_getUnsafeSet( const UCollator *coll,
USet *unsafe,

View File

@ -343,13 +343,28 @@ ucol_openFromShortString( const char *definition,
* @param status to hold the error code
* @return the size of the contraction set
*
* @draft ICU 3.0
* @deprecated ICU 3.4, use ucol_getContractionsAndExpansions instead
*/
U_CAPI int32_t U_EXPORT2
ucol_getContractions( const UCollator *coll,
USet *conts,
UErrorCode *status);
/**
* Get a set containing the expansions defined by the collator. The set includes
* both the UCA expansions and the expansions defined by the tailoring
* @param coll collator
* @param contractions if not NULL, the set to hold the contractions
* @param expansions if not NULL, the set to hold the expansions
* @param addPrefixes add the prefix contextual elements to contractions
* @param status to hold the error code
*
* @draft ICU 3.4
*/
U_CAPI void U_EXPORT2
ucol_getContractionsAndExpansions( const UCollator *coll,
USet *contractions, USet *expansions,
UBool addPrefixes, UErrorCode *status);
/**
* Close a UCollator.