ICU-1245 use NFKC for prefix analysis in Japanese
X-SVN-Rev: 5982
This commit is contained in:
parent
8ac6cb4b40
commit
0180438c87
@ -27,6 +27,7 @@
|
||||
#include "unicode/tblcoll.h"
|
||||
#include "unicode/coleitr.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "unicode/normlzr.h"
|
||||
#include "unicode/udata.h"
|
||||
|
||||
#include "unormimp.h"
|
||||
@ -1826,25 +1827,27 @@ uint32_t getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate
|
||||
// Contraction tables are used - so the whole process is not unlike contraction.
|
||||
// prefix data is stored backwards in the table.
|
||||
const UChar *UCharOffset;
|
||||
UChar tchar, *sourcePointer = source->pos;
|
||||
UChar schar, tchar, *sourcePointer = source->pos;
|
||||
Normalizer n(source->string, source->pos-source->string, UNORM_NFKC);
|
||||
n.last();
|
||||
for(;;) {
|
||||
// This loop will run once per source string character, for as long as we
|
||||
// are matching a potential contraction sequence
|
||||
|
||||
// First we position ourselves at the beginning of contraction sequence
|
||||
const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
|
||||
if(sourcePointer != source->string) {
|
||||
--sourcePointer;
|
||||
} else {
|
||||
// Ran off the beginning of the source string.
|
||||
schar = (UChar)n.previous();
|
||||
|
||||
if(schar==Normalizer::DONE) {
|
||||
CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
|
||||
break;
|
||||
}
|
||||
|
||||
while(*(sourcePointer) > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
|
||||
while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
|
||||
UCharOffset++;
|
||||
}
|
||||
|
||||
if (*(sourcePointer) == tchar) {
|
||||
if (schar == tchar) {
|
||||
// Found the source string char in the table.
|
||||
// Pick up the corresponding CE from the table.
|
||||
CE = *(coll->contractionCEs +
|
||||
|
@ -765,10 +765,13 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
|
||||
el.prefix = el.prefixChars;
|
||||
el.cPoints = el.uchars;
|
||||
if(tok->prefix != 0) { // adjust the source if there is a prefix
|
||||
el.prefixSize = (tok->prefix>>24);
|
||||
for(i = 0; i < tok->prefix>>24; i++) { // prefixes are going to be looked up backwards
|
||||
// need to normalize to NFKC first
|
||||
UChar buffNFKC[256];
|
||||
el.prefixSize = unorm_normalize(src->source+(tok->prefix&0x00FFFFFF), tok->prefix>>24, UNORM_NFKC, 0, buffNFKC, 256, status);
|
||||
for(i = 0; i < el.prefixSize; i++) { // prefixes are going to be looked up backwards
|
||||
// therefore, we will promptly reverse the prefix buffer...
|
||||
el.prefix[i] = *(src->source+(tok->prefix& 0x00FFFFFF)+(tok->prefix>>24)-i-1);
|
||||
//el.prefix[i] = *(src->source+(tok->prefix& 0x00FFFFFF)+(tok->prefix>>24)-i-1);
|
||||
el.prefix[i] = *(buffNFKC+el.prefixSize-i-1);
|
||||
}
|
||||
//uprv_memcpy(el.prefix, (tok->prefix & 0x00FFFFFF) + src->source, el.prefixSize*sizeof(UChar));
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user