ICU-4165 API for finding contractions and expansions - C version

X-SVN-Rev: 17952
This commit is contained in:
Vladimir Weinstein 2005-06-20 22:58:50 +00:00
parent 91ce50bc16
commit 2e725da4c6
2 changed files with 122 additions and 42 deletions

View File

@ -1,6 +1,6 @@
/* /*
******************************************************************************* *******************************************************************************
* Copyright (C) 2004, International Business Machines * Copyright (C) 2004-2005, International Business Machines
* Corporation and others. All Rights Reserved. * Corporation and others. All Rights Reserved.
******************************************************************************* *******************************************************************************
* file name: ucol_sit.cpp * file name: ucol_sit.cpp
@ -461,7 +461,7 @@ ucol_openFromShortString( const char *definition,
uprv_memset(buffer, 0, internalBufferSize); uprv_memset(buffer, 0, internalBufferSize);
uloc_canonicalize(s.locale, buffer, internalBufferSize, status); uloc_canonicalize(s.locale, buffer, internalBufferSize, status);
UCollator *result = ucol_open(s.locale, status); UCollator *result = ucol_open(buffer, status);
int32_t i = 0; int32_t i = 0;
for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
@ -882,50 +882,94 @@ ucol_getAttributeOrDefault(const UCollator *coll, UColAttribute attr, UErrorCode
struct contContext { struct contContext {
const UCollator *coll; const UCollator *coll;
USet *conts; USet *conts;
USet *expansions;
USet *removedContractions; USet *removedContractions;
UBool addPrefixes;
UErrorCode *status; UErrorCode *status;
}; };
static void static void
addContraction(const UCollator *coll, USet *contractions, UChar *buffer, int32_t bufLen, addSpecial(contContext *context, UChar *buffer, int32_t bufLen,
uint32_t CE, int32_t rightIndex, UErrorCode *status) uint32_t CE, int32_t leftIndex, int32_t rightIndex, UErrorCode *status)
{ {
const UCollator *coll = context->coll;
USet *contractions = context->conts;
USet *expansions = context->expansions;
UBool addPrefixes = context->addPrefixes;
const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
uint32_t newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
// we might have a contraction that ends from previous level
if(newCE != UCOL_NOT_FOUND) {
if(isSpecial(CE) && getCETag(CE) == CONTRACTION_TAG && isSpecial(newCE) && getCETag(newCE) == SPEC_PROC_TAG && addPrefixes) {
addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status);
}
if(rightIndex-leftIndex > 1) {
uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex);
if(expansions && isSpecial(CE) && getCETag(CE) == EXPANSION_TAG) {
uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex);
}
}
}
UCharOffset++;
// check whether we're doing contraction or prefix
if(getCETag(CE) == SPEC_PROC_TAG && addPrefixes) {
if(leftIndex == 0) {
*status = U_INTERNAL_PROGRAM_ERROR;
return;
}
--leftIndex;
while(*UCharOffset != 0xFFFF) {
newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
buffer[leftIndex] = *UCharOffset;
if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) {
addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status);
} else {
uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex);
if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) {
uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex);
}
}
UCharOffset++;
}
} else if(getCETag(CE) == CONTRACTION_TAG) {
if(rightIndex == bufLen-1) { if(rightIndex == bufLen-1) {
*status = U_INTERNAL_PROGRAM_ERROR; *status = U_INTERNAL_PROGRAM_ERROR;
return; return;
} }
const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
uint32_t newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
// we might have a contraction that ends from previous level
if(newCE != UCOL_NOT_FOUND && rightIndex > 1) {
uset_addString(contractions, buffer, rightIndex);
}
UCharOffset++;
while(*UCharOffset != 0xFFFF) { while(*UCharOffset != 0xFFFF) {
newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
buffer[rightIndex] = *UCharOffset; buffer[rightIndex] = *UCharOffset;
if(isSpecial(newCE) && getCETag(newCE) == CONTRACTION_TAG) { if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) {
addContraction(coll, contractions, buffer, bufLen, newCE, rightIndex + 1, status); addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex+1, status);
} else { } else {
uset_addString(contractions, buffer, rightIndex + 1); uset_addString(contractions, buffer+leftIndex, rightIndex+1-leftIndex);
if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) {
uset_addString(expansions, buffer+leftIndex, rightIndex+1-leftIndex);
}
} }
UCharOffset++; UCharOffset++;
} }
}
} }
U_CDECL_BEGIN U_CDECL_BEGIN
static UBool U_CALLCONV static UBool U_CALLCONV
_processContractions(const void *context, UChar32 start, UChar32 limit, uint32_t CE) _processSpecials(const void *context, UChar32 start, UChar32 limit, uint32_t CE)
{ {
UErrorCode *status = ((contContext *)context)->status; UErrorCode *status = ((contContext *)context)->status;
USet *unsafe = ((contContext *)context)->conts; USet *contractions = ((contContext *)context)->conts;
USet *expansions = ((contContext *)context)->expansions;
USet *removed = ((contContext *)context)->removedContractions; USet *removed = ((contContext *)context)->removedContractions;
const UCollator *coll = ((contContext *)context)->coll; const UCollator *coll = ((contContext *)context)->coll;
UBool addPrefixes = ((contContext *)context)->addPrefixes;
UChar contraction[internalBufferSize]; UChar contraction[internalBufferSize];
if(isSpecial(CE) && getCETag(CE) == CONTRACTION_TAG) { if(isSpecial(CE)) {
if(contractions && ((getCETag(CE) == SPEC_PROC_TAG && addPrefixes) || getCETag(CE) == CONTRACTION_TAG)) {
while(start < limit && U_SUCCESS(*status)) { while(start < limit && U_SUCCESS(*status)) {
// if there are suppressed contractions, we don't // if there are suppressed contractions, we don't
// want to add them. // want to add them.
@ -935,10 +979,15 @@ _processContractions(const void *context, UChar32 start, UChar32 limit, uint32_t
} }
// we start our contraction from middle, since we don't know if it // we start our contraction from middle, since we don't know if it
// will grow toward right or left // will grow toward right or left
contraction[0] = (UChar)start; contraction[internalBufferSize/2] = (UChar)start;
addContraction(coll, unsafe, contraction, internalBufferSize, CE, 1, status); addSpecial(((contContext *)context), contraction, internalBufferSize, CE, internalBufferSize/2, internalBufferSize/2+1, status);
start++; start++;
} }
} else if(expansions && getCETag(CE) == EXPANSION_TAG) {
while(start < limit && U_SUCCESS(*status)) {
uset_add(expansions, start++);
}
}
} }
if(U_FAILURE(*status)) { if(U_FAILURE(*status)) {
return FALSE; return FALSE;
@ -970,47 +1019,63 @@ U_CAPI int32_t U_EXPORT2
ucol_getContractions( const UCollator *coll, ucol_getContractions( const UCollator *coll,
USet *contractions, USet *contractions,
UErrorCode *status) UErrorCode *status)
{
int32_t noConts = 0;
ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status);
return uset_getItemCount(contractions);
}
/**
* Get the sets of contractions and expansions defined by the collator. The sets
* include both the UCA contractions/expansions and those defined by the tailoring
* @param coll collator
* @param contractions the set to hold the contractions
* @param expansions if not NULL, the set to hold the expansions
* @param addPrefixes add the prefix contextual elements to contractions
* @param status to hold the error code
*
* @draft ICU 3.4
*/
U_CAPI void U_EXPORT2
ucol_getContractionsAndExpansions( const UCollator *coll,
USet *contractions,
USet *expansions,
UBool addPrefixes,
UErrorCode *status)
{ {
if(U_FAILURE(*status)) { if(U_FAILURE(*status)) {
return 0; return;
} }
if(coll == NULL || contractions == NULL) { if(coll == NULL) {
*status = U_ILLEGAL_ARGUMENT_ERROR; *status = U_ILLEGAL_ARGUMENT_ERROR;
return 0; return;
} }
uset_clear(contractions); uset_clear(contractions);
if(expansions) {
uset_clear(expansions);
}
int32_t rulesLen = 0; int32_t rulesLen = 0;
const UChar* rules = ucol_getRules(coll, &rulesLen); const UChar* rules = ucol_getRules(coll, &rulesLen);
UColTokenParser src; UColTokenParser src;
ucol_tok_initTokenList(&src, rules, rulesLen, coll->UCA, status); ucol_tok_initTokenList(&src, rules, rulesLen, coll->UCA, status);
contContext c = { NULL, contractions, src.removeSet, status }; contContext c = { NULL, contractions, expansions, src.removeSet, addPrefixes, status };
coll->mapping->getFoldingOffset = _getTrieFoldingOffset; coll->mapping->getFoldingOffset = _getTrieFoldingOffset;
// TODO: if you're suppressing contractions in the tailoring
// you want to remove (or rather not include) contractions
// from the UCA.
// Probably want to pass a set of contraction starters that
// are suppressed. However, we don't want a dependency on
// the builder, so this is going to be hard to pull off.
// Add the UCA contractions // Add the UCA contractions
c.coll = coll->UCA; c.coll = coll->UCA;
utrie_enum(coll->UCA->mapping, NULL, _processContractions, &c); utrie_enum(coll->UCA->mapping, NULL, _processSpecials, &c);
// This is collator specific. Add contractions from a collator // This is collator specific. Add contractions from a collator
c.coll = coll; c.coll = coll;
c.removedContractions = NULL; c.removedContractions = NULL;
utrie_enum(coll->mapping, NULL, _processContractions, &c); utrie_enum(coll->mapping, NULL, _processSpecials, &c);
ucol_tok_closeTokenList(&src); ucol_tok_closeTokenList(&src);
return uset_getItemCount(contractions);
} }
U_CAPI int32_t U_EXPORT2 U_CAPI int32_t U_EXPORT2
ucol_getUnsafeSet( const UCollator *coll, ucol_getUnsafeSet( const UCollator *coll,
USet *unsafe, USet *unsafe,

View File

@ -343,13 +343,28 @@ ucol_openFromShortString( const char *definition,
* @param status to hold the error code * @param status to hold the error code
* @return the size of the contraction set * @return the size of the contraction set
* *
* @draft ICU 3.0 * @deprecated ICU 3.4, use ucol_getContractionsAndExpansions instead
*/ */
U_CAPI int32_t U_EXPORT2 U_CAPI int32_t U_EXPORT2
ucol_getContractions( const UCollator *coll, ucol_getContractions( const UCollator *coll,
USet *conts, USet *conts,
UErrorCode *status); UErrorCode *status);
/**
* Get the sets of contractions and expansions defined by the collator. The sets
* include both the UCA contractions/expansions and those defined by the tailoring
* @param coll collator
* @param contractions if not NULL, the set to hold the contractions
* @param expansions if not NULL, the set to hold the expansions
* @param addPrefixes add the prefix contextual elements to contractions
* @param status to hold the error code
*
* @draft ICU 3.4
*/
U_CAPI void U_EXPORT2
ucol_getContractionsAndExpansions( const UCollator *coll,
USet *contractions, USet *expansions,
UBool addPrefixes, UErrorCode *status);
/** /**
* Close a UCollator. * Close a UCollator.