ICU-1245 use NFKC for prefix analysis in Japanese

X-SVN-Rev: 5982
This commit is contained in:
Vladimir Weinstein 2001-10-01 02:58:26 +00:00
parent 8ac6cb4b40
commit 0180438c87
2 changed files with 16 additions and 10 deletions

View File

@ -27,6 +27,7 @@
#include "unicode/tblcoll.h"
#include "unicode/coleitr.h"
#include "unicode/unorm.h"
#include "unicode/normlzr.h"
#include "unicode/udata.h"
#include "unormimp.h"
@ -1826,25 +1827,27 @@ uint32_t getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate
// Contraction tables are used - so the whole process is not unlike contraction.
// prefix data is stored backwards in the table.
const UChar *UCharOffset;
UChar tchar, *sourcePointer = source->pos;
UChar schar, tchar, *sourcePointer = source->pos;
Normalizer n(source->string, source->pos-source->string, UNORM_NFKC);
n.last();
for(;;) {
// This loop will run once per source string character, for as long as we
// are matching a potential contraction sequence
// First we position ourselves at the beginning of the contraction sequence
const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
if(sourcePointer != source->string) {
--sourcePointer;
} else {
// Ran off the beginning of the source string.
schar = (UChar)n.previous();
if(schar==Normalizer::DONE) {
CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
break;
}
while(*(sourcePointer) > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
UCharOffset++;
}
if (*(sourcePointer) == tchar) {
if (schar == tchar) {
// Found the source string char in the table.
// Pick up the corresponding CE from the table.
CE = *(coll->contractionCEs +

View File

@ -765,10 +765,13 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
el.prefix = el.prefixChars;
el.cPoints = el.uchars;
if(tok->prefix != 0) { // adjust the source if there is a prefix
el.prefixSize = (tok->prefix>>24);
for(i = 0; i < tok->prefix>>24; i++) { // prefixes are going to be looked up backwards
// need to normalize to NFKC first
UChar buffNFKC[256];
el.prefixSize = unorm_normalize(src->source+(tok->prefix&0x00FFFFFF), tok->prefix>>24, UNORM_NFKC, 0, buffNFKC, 256, status);
for(i = 0; i < el.prefixSize; i++) { // prefixes are going to be looked up backwards
// therefore, we will promptly reverse the prefix buffer...
el.prefix[i] = *(src->source+(tok->prefix& 0x00FFFFFF)+(tok->prefix>>24)-i-1);
//el.prefix[i] = *(src->source+(tok->prefix& 0x00FFFFFF)+(tok->prefix>>24)-i-1);
el.prefix[i] = *(buffNFKC+el.prefixSize-i-1);
}
//uprv_memcpy(el.prefix, (tok->prefix & 0x00FFFFFF) + src->source, el.prefixSize*sizeof(UChar));