ICU-4165 API for finding contractions and expansions - C version
X-SVN-Rev: 17952
This commit is contained in:
parent
91ce50bc16
commit
2e725da4c6
@ -1,6 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
* Copyright (C) 2004, International Business Machines
|
* Copyright (C) 2004-2005, International Business Machines
|
||||||
* Corporation and others. All Rights Reserved.
|
* Corporation and others. All Rights Reserved.
|
||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
* file name: ucol_sit.cpp
|
* file name: ucol_sit.cpp
|
||||||
@ -461,7 +461,7 @@ ucol_openFromShortString( const char *definition,
|
|||||||
uprv_memset(buffer, 0, internalBufferSize);
|
uprv_memset(buffer, 0, internalBufferSize);
|
||||||
uloc_canonicalize(s.locale, buffer, internalBufferSize, status);
|
uloc_canonicalize(s.locale, buffer, internalBufferSize, status);
|
||||||
|
|
||||||
UCollator *result = ucol_open(s.locale, status);
|
UCollator *result = ucol_open(buffer, status);
|
||||||
int32_t i = 0;
|
int32_t i = 0;
|
||||||
|
|
||||||
for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
|
for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
|
||||||
@ -882,50 +882,94 @@ ucol_getAttributeOrDefault(const UCollator *coll, UColAttribute attr, UErrorCode
|
|||||||
struct contContext {
|
struct contContext {
|
||||||
const UCollator *coll;
|
const UCollator *coll;
|
||||||
USet *conts;
|
USet *conts;
|
||||||
|
USet *expansions;
|
||||||
USet *removedContractions;
|
USet *removedContractions;
|
||||||
|
UBool addPrefixes;
|
||||||
UErrorCode *status;
|
UErrorCode *status;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
addContraction(const UCollator *coll, USet *contractions, UChar *buffer, int32_t bufLen,
|
addSpecial(contContext *context, UChar *buffer, int32_t bufLen,
|
||||||
uint32_t CE, int32_t rightIndex, UErrorCode *status)
|
uint32_t CE, int32_t leftIndex, int32_t rightIndex, UErrorCode *status)
|
||||||
{
|
{
|
||||||
|
const UCollator *coll = context->coll;
|
||||||
|
USet *contractions = context->conts;
|
||||||
|
USet *expansions = context->expansions;
|
||||||
|
UBool addPrefixes = context->addPrefixes;
|
||||||
|
|
||||||
|
const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
|
||||||
|
uint32_t newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
|
||||||
|
// we might have a contraction that ends from previous level
|
||||||
|
if(newCE != UCOL_NOT_FOUND) {
|
||||||
|
if(isSpecial(CE) && getCETag(CE) == CONTRACTION_TAG && isSpecial(newCE) && getCETag(newCE) == SPEC_PROC_TAG && addPrefixes) {
|
||||||
|
addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status);
|
||||||
|
}
|
||||||
|
if(rightIndex-leftIndex > 1) {
|
||||||
|
uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex);
|
||||||
|
if(expansions && isSpecial(CE) && getCETag(CE) == EXPANSION_TAG) {
|
||||||
|
uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
UCharOffset++;
|
||||||
|
// check whether we're doing contraction or prefix
|
||||||
|
if(getCETag(CE) == SPEC_PROC_TAG && addPrefixes) {
|
||||||
|
if(leftIndex == 0) {
|
||||||
|
*status = U_INTERNAL_PROGRAM_ERROR;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
--leftIndex;
|
||||||
|
while(*UCharOffset != 0xFFFF) {
|
||||||
|
newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
|
||||||
|
buffer[leftIndex] = *UCharOffset;
|
||||||
|
if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) {
|
||||||
|
addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status);
|
||||||
|
} else {
|
||||||
|
uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex);
|
||||||
|
if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) {
|
||||||
|
uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
UCharOffset++;
|
||||||
|
}
|
||||||
|
} else if(getCETag(CE) == CONTRACTION_TAG) {
|
||||||
if(rightIndex == bufLen-1) {
|
if(rightIndex == bufLen-1) {
|
||||||
*status = U_INTERNAL_PROGRAM_ERROR;
|
*status = U_INTERNAL_PROGRAM_ERROR;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
|
|
||||||
uint32_t newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
|
|
||||||
// we might have a contraction that ends from previous level
|
|
||||||
if(newCE != UCOL_NOT_FOUND && rightIndex > 1) {
|
|
||||||
uset_addString(contractions, buffer, rightIndex);
|
|
||||||
}
|
|
||||||
|
|
||||||
UCharOffset++;
|
|
||||||
while(*UCharOffset != 0xFFFF) {
|
while(*UCharOffset != 0xFFFF) {
|
||||||
newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
|
newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
|
||||||
buffer[rightIndex] = *UCharOffset;
|
buffer[rightIndex] = *UCharOffset;
|
||||||
if(isSpecial(newCE) && getCETag(newCE) == CONTRACTION_TAG) {
|
if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) {
|
||||||
addContraction(coll, contractions, buffer, bufLen, newCE, rightIndex + 1, status);
|
addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex+1, status);
|
||||||
} else {
|
} else {
|
||||||
uset_addString(contractions, buffer, rightIndex + 1);
|
uset_addString(contractions, buffer+leftIndex, rightIndex+1-leftIndex);
|
||||||
|
if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) {
|
||||||
|
uset_addString(expansions, buffer+leftIndex, rightIndex+1-leftIndex);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
UCharOffset++;
|
UCharOffset++;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
U_CDECL_BEGIN
|
U_CDECL_BEGIN
|
||||||
static UBool U_CALLCONV
|
static UBool U_CALLCONV
|
||||||
_processContractions(const void *context, UChar32 start, UChar32 limit, uint32_t CE)
|
_processSpecials(const void *context, UChar32 start, UChar32 limit, uint32_t CE)
|
||||||
{
|
{
|
||||||
UErrorCode *status = ((contContext *)context)->status;
|
UErrorCode *status = ((contContext *)context)->status;
|
||||||
USet *unsafe = ((contContext *)context)->conts;
|
USet *contractions = ((contContext *)context)->conts;
|
||||||
|
USet *expansions = ((contContext *)context)->expansions;
|
||||||
USet *removed = ((contContext *)context)->removedContractions;
|
USet *removed = ((contContext *)context)->removedContractions;
|
||||||
const UCollator *coll = ((contContext *)context)->coll;
|
const UCollator *coll = ((contContext *)context)->coll;
|
||||||
|
UBool addPrefixes = ((contContext *)context)->addPrefixes;
|
||||||
UChar contraction[internalBufferSize];
|
UChar contraction[internalBufferSize];
|
||||||
if(isSpecial(CE) && getCETag(CE) == CONTRACTION_TAG) {
|
if(isSpecial(CE)) {
|
||||||
|
if(contractions && ((getCETag(CE) == SPEC_PROC_TAG && addPrefixes) || getCETag(CE) == CONTRACTION_TAG)) {
|
||||||
while(start < limit && U_SUCCESS(*status)) {
|
while(start < limit && U_SUCCESS(*status)) {
|
||||||
// if there are suppressed contractions, we don't
|
// if there are suppressed contractions, we don't
|
||||||
// want to add them.
|
// want to add them.
|
||||||
@ -935,10 +979,15 @@ _processContractions(const void *context, UChar32 start, UChar32 limit, uint32_t
|
|||||||
}
|
}
|
||||||
// we start our contraction from middle, since we don't know if it
|
// we start our contraction from middle, since we don't know if it
|
||||||
// will grow toward right or left
|
// will grow toward right or left
|
||||||
contraction[0] = (UChar)start;
|
contraction[internalBufferSize/2] = (UChar)start;
|
||||||
addContraction(coll, unsafe, contraction, internalBufferSize, CE, 1, status);
|
addSpecial(((contContext *)context), contraction, internalBufferSize, CE, internalBufferSize/2, internalBufferSize/2+1, status);
|
||||||
start++;
|
start++;
|
||||||
}
|
}
|
||||||
|
} else if(expansions && getCETag(CE) == EXPANSION_TAG) {
|
||||||
|
while(start < limit && U_SUCCESS(*status)) {
|
||||||
|
uset_add(expansions, start++);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if(U_FAILURE(*status)) {
|
if(U_FAILURE(*status)) {
|
||||||
return FALSE;
|
return FALSE;
|
||||||
@ -970,47 +1019,63 @@ U_CAPI int32_t U_EXPORT2
|
|||||||
ucol_getContractions( const UCollator *coll,
|
ucol_getContractions( const UCollator *coll,
|
||||||
USet *contractions,
|
USet *contractions,
|
||||||
UErrorCode *status)
|
UErrorCode *status)
|
||||||
|
{
|
||||||
|
int32_t noConts = 0;
|
||||||
|
ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status);
|
||||||
|
return uset_getItemCount(contractions);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a set containing the expansions defined by the collator. The set includes
|
||||||
|
* both the UCA expansions and the expansions defined by the tailoring
|
||||||
|
* @param coll collator
|
||||||
|
* @param conts the set to hold the result
|
||||||
|
* @param addPrefixes add the prefix contextual elements to contractions
|
||||||
|
* @param status to hold the error code
|
||||||
|
* @return the size of the contraction set
|
||||||
|
*
|
||||||
|
* @draft ICU 3.4
|
||||||
|
*/
|
||||||
|
U_CAPI void U_EXPORT2
|
||||||
|
ucol_getContractionsAndExpansions( const UCollator *coll,
|
||||||
|
USet *contractions,
|
||||||
|
USet *expansions,
|
||||||
|
UBool addPrefixes,
|
||||||
|
UErrorCode *status)
|
||||||
{
|
{
|
||||||
if(U_FAILURE(*status)) {
|
if(U_FAILURE(*status)) {
|
||||||
return 0;
|
return;
|
||||||
}
|
}
|
||||||
if(coll == NULL || contractions == NULL) {
|
if(coll == NULL) {
|
||||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||||
return 0;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
uset_clear(contractions);
|
uset_clear(contractions);
|
||||||
|
if(expansions) {
|
||||||
|
uset_clear(expansions);
|
||||||
|
}
|
||||||
int32_t rulesLen = 0;
|
int32_t rulesLen = 0;
|
||||||
const UChar* rules = ucol_getRules(coll, &rulesLen);
|
const UChar* rules = ucol_getRules(coll, &rulesLen);
|
||||||
UColTokenParser src;
|
UColTokenParser src;
|
||||||
ucol_tok_initTokenList(&src, rules, rulesLen, coll->UCA, status);
|
ucol_tok_initTokenList(&src, rules, rulesLen, coll->UCA, status);
|
||||||
|
|
||||||
contContext c = { NULL, contractions, src.removeSet, status };
|
contContext c = { NULL, contractions, expansions, src.removeSet, addPrefixes, status };
|
||||||
|
|
||||||
coll->mapping->getFoldingOffset = _getTrieFoldingOffset;
|
coll->mapping->getFoldingOffset = _getTrieFoldingOffset;
|
||||||
|
|
||||||
// TODO: if you're supressing contractions in the tailoring
|
|
||||||
// you want to remove (or rather not include) contractions
|
|
||||||
// from the UCA.
|
|
||||||
// Probably want to pass a set of contraction starters that
|
|
||||||
// are suppressed. However, we don't want a dependency on
|
|
||||||
// the builder, so this is going to be hard to pull off.
|
|
||||||
|
|
||||||
// Add the UCA contractions
|
// Add the UCA contractions
|
||||||
c.coll = coll->UCA;
|
c.coll = coll->UCA;
|
||||||
utrie_enum(coll->UCA->mapping, NULL, _processContractions, &c);
|
utrie_enum(coll->UCA->mapping, NULL, _processSpecials, &c);
|
||||||
|
|
||||||
// This is collator specific. Add contractions from a collator
|
// This is collator specific. Add contractions from a collator
|
||||||
c.coll = coll;
|
c.coll = coll;
|
||||||
c.removedContractions = NULL;
|
c.removedContractions = NULL;
|
||||||
utrie_enum(coll->mapping, NULL, _processContractions, &c);
|
utrie_enum(coll->mapping, NULL, _processSpecials, &c);
|
||||||
ucol_tok_closeTokenList(&src);
|
ucol_tok_closeTokenList(&src);
|
||||||
|
|
||||||
return uset_getItemCount(contractions);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
U_CAPI int32_t U_EXPORT2
|
U_CAPI int32_t U_EXPORT2
|
||||||
ucol_getUnsafeSet( const UCollator *coll,
|
ucol_getUnsafeSet( const UCollator *coll,
|
||||||
USet *unsafe,
|
USet *unsafe,
|
||||||
|
@ -343,13 +343,28 @@ ucol_openFromShortString( const char *definition,
|
|||||||
* @param status to hold the error code
|
* @param status to hold the error code
|
||||||
* @return the size of the contraction set
|
* @return the size of the contraction set
|
||||||
*
|
*
|
||||||
* @draft ICU 3.0
|
* @deprecated ICU 3.4, use ucol_getContractionsAndExpansions instead
|
||||||
*/
|
*/
|
||||||
U_CAPI int32_t U_EXPORT2
|
U_CAPI int32_t U_EXPORT2
|
||||||
ucol_getContractions( const UCollator *coll,
|
ucol_getContractions( const UCollator *coll,
|
||||||
USet *conts,
|
USet *conts,
|
||||||
UErrorCode *status);
|
UErrorCode *status);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a set containing the expansions defined by the collator. The set includes
|
||||||
|
* both the UCA expansions and the expansions defined by the tailoring
|
||||||
|
* @param coll collator
|
||||||
|
* @param contractions if not NULL, the set to hold the contractions
|
||||||
|
* @param expansions if not NULL, the set to hold the expansions
|
||||||
|
* @param addPrefixes add the prefix contextual elements to contractions
|
||||||
|
* @param status to hold the error code
|
||||||
|
*
|
||||||
|
* @draft ICU 3.4
|
||||||
|
*/
|
||||||
|
U_CAPI void U_EXPORT2
|
||||||
|
ucol_getContractionsAndExpansions( const UCollator *coll,
|
||||||
|
USet *contractions, USet *expansions,
|
||||||
|
UBool addPrefixes, UErrorCode *status);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Close a UCollator.
|
* Close a UCollator.
|
||||||
|
Loading…
Reference in New Issue
Block a user