2001-06-20 22:24:42 +00:00
|
|
|
/*
*******************************************************************************
*
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: store.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2001may25
* created by: Markus W. Scherer
*
* Store Unicode normalization data in a memory-mappable file.
*/
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include "unicode/utypes.h"
|
2010-06-04 18:01:08 +00:00
|
|
|
#include "unicode/udata.h"
|
|
|
|
#include "unicode/uset.h"
|
|
|
|
#include "cmemory.h"
|
|
|
|
#include "filestrm.h"
|
|
|
|
#include "utrie.h"
|
|
|
|
#include "toolutil.h"
|
|
|
|
#include "writesrc.h"
|
|
|
|
#include "unormimp.h"
|
|
|
|
#include "gennorm.h"
|
2002-05-29 18:36:09 +00:00
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
/*
 * Debug output switch, currently off (0).
 * NOTE(review): not referenced in the visible part of this file -- confirm
 * whether it is still needed.
 */
#define DO_DEBUG_OUT 0
|
2002-03-14 23:54:09 +00:00
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
/*
 * Number of elements of an array, as an int32_t.
 * Only meaningful for a real array object; a pointer (including an array
 * function parameter) would yield sizeof(pointer)/sizeof(element) instead.
 */
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
2002-03-14 23:54:09 +00:00
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
#if !UCONFIG_NO_NORMALIZATION
|
2002-03-14 23:54:09 +00:00
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
/* builder data ------------------------------------------------------------- */
|
2002-03-14 23:54:09 +00:00
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
static UNewTrie *normTrie;
|
2002-03-14 23:54:09 +00:00
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
static UToolMemory *normMem, *utf32Mem;
|
2002-03-14 23:54:09 +00:00
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
static Norm *norms;
|
2002-03-14 23:54:09 +00:00
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
static USet *compositionExclusions;
|
2002-03-14 23:54:09 +00:00
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
/* allocate and initialize a Norm unit */
|
|
|
|
static Norm *
|
|
|
|
allocNorm() {
|
|
|
|
/* allocate Norm */
|
|
|
|
Norm *p=(Norm *)utm_alloc(normMem);
|
|
|
|
return p;
|
2002-03-14 23:54:09 +00:00
|
|
|
}
|
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
extern void
|
|
|
|
init() {
|
|
|
|
uint16_t *p16;
|
2002-11-05 00:56:25 +00:00
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
normTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
|
|
|
|
uprv_memset(normTrie, 0, sizeof(UNewTrie));
|
2002-11-05 00:56:25 +00:00
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
/* initialize the two tries */
|
|
|
|
if(NULL==utrie_open(normTrie, NULL, 30000, 0, 0, FALSE)) {
|
|
|
|
fprintf(stderr, "error: failed to initialize tries\n");
|
|
|
|
exit(U_MEMORY_ALLOCATION_ERROR);
|
|
|
|
}
|
2002-11-05 00:56:25 +00:00
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
/* allocate Norm structures and reset the first one */
|
|
|
|
normMem=utm_open("gennorm normalization structs", 20000, 20000, sizeof(Norm));
|
|
|
|
norms=allocNorm();
|
2002-11-05 00:56:25 +00:00
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
/* allocate UTF-32 string memory */
|
|
|
|
utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 30000, 4);
|
2002-11-05 00:56:25 +00:00
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
compositionExclusions=uset_openEmpty();
|
2002-11-05 00:56:25 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2010-06-04 18:01:08 +00:00
|
|
|
* get or create a Norm unit;
|
|
|
|
* get or create the intermediate trie entries for it as well
|
2002-11-05 00:56:25 +00:00
|
|
|
*/
|
2010-06-04 18:01:08 +00:00
|
|
|
static Norm *
|
|
|
|
createNorm(uint32_t code) {
|
|
|
|
Norm *p;
|
|
|
|
uint32_t i;
|
2002-11-05 00:56:25 +00:00
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
i=utrie_get32(normTrie, (UChar32)code, NULL);
|
|
|
|
if(i!=0) {
|
|
|
|
p=norms+i;
|
2002-11-05 00:56:25 +00:00
|
|
|
} else {
|
2010-06-04 18:01:08 +00:00
|
|
|
/* allocate Norm */
|
|
|
|
p=allocNorm();
|
|
|
|
if(!utrie_set32(normTrie, (UChar32)code, (uint32_t)(p-norms))) {
|
|
|
|
fprintf(stderr, "error: too many normalization entries\n");
|
|
|
|
exit(U_BUFFER_OVERFLOW_ERROR);
|
2001-06-20 22:24:42 +00:00
|
|
|
}
|
|
|
|
}
|
2010-06-04 18:01:08 +00:00
|
|
|
return p;
|
2001-06-20 22:24:42 +00:00
|
|
|
}
|
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
/* processing incoming normalization data ----------------------------------- */
|
2001-06-20 22:24:42 +00:00
|
|
|
|
2002-03-07 19:49:37 +00:00
|
|
|
/*
|
2010-06-04 18:01:08 +00:00
|
|
|
* process the data for one code point listed in UnicodeData;
|
|
|
|
* UnicodeData itself never maps a code point to both NFD and NFKD
|
2002-03-07 19:49:37 +00:00
|
|
|
*/
|
2001-06-20 22:24:42 +00:00
|
|
|
extern void
|
2010-06-04 18:01:08 +00:00
|
|
|
storeNorm(uint32_t code, Norm *norm) {
|
|
|
|
Norm *p=createNorm(code);
|
2001-06-20 22:24:42 +00:00
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
/* store the data */
|
|
|
|
uprv_memcpy(p, norm, sizeof(Norm));
|
2008-10-22 19:50:07 +00:00
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
/* store the decomposition string if there is one here */
|
|
|
|
if(norm->lenNFD!=0) {
|
|
|
|
uint32_t *s32=utm_allocN(utf32Mem, norm->lenNFD);
|
|
|
|
uprv_memcpy(s32, norm->nfd, norm->lenNFD*4);
|
|
|
|
p->nfd=s32;
|
|
|
|
} else if(norm->lenNFKD!=0) {
|
|
|
|
uint32_t *s32=utm_allocN(utf32Mem, norm->lenNFKD);
|
|
|
|
uprv_memcpy(s32, norm->nfkd, norm->lenNFKD*4);
|
|
|
|
p->nfkd=s32;
|
2008-10-22 19:50:07 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-06-04 18:01:08 +00:00
|
|
|
extern void
|
|
|
|
setCompositionExclusion(uint32_t code) {
|
|
|
|
uset_add(compositionExclusions, (UChar32)code);
|
2008-10-22 19:50:07 +00:00
|
|
|
}
|
|
|
|
|
2003-05-06 01:37:52 +00:00
|
|
|
|
2010-01-06 23:50:03 +00:00
|
|
|
static void
|
|
|
|
writeAllCC(FILE *f) {
|
|
|
|
uint32_t i;
|
|
|
|
UChar32 prevCode, code;
|
|
|
|
uint8_t prevCC, cc;
|
|
|
|
UBool isInBlockZero;
|
|
|
|
|
|
|
|
fprintf(f, "# Canonical_Combining_Class (ccc) values\n");
|
|
|
|
prevCode=0;
|
|
|
|
prevCC=0;
|
|
|
|
for(code=0; code<=0x110000;) {
|
|
|
|
if(code==0x110000) {
|
|
|
|
cc=0;
|
|
|
|
} else {
|
|
|
|
i=utrie_get32(normTrie, code, &isInBlockZero);
|
|
|
|
if(i==0 || isInBlockZero) {
|
|
|
|
cc=0;
|
|
|
|
} else {
|
|
|
|
cc=norms[i].udataCC;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if(prevCC!=cc) {
|
|
|
|
if(prevCC!=0) {
|
|
|
|
uint32_t lastCode=code-1;
|
|
|
|
if(prevCode==lastCode) {
|
|
|
|
fprintf(f, "%04lX:%d\n", (long)lastCode, prevCC);
|
|
|
|
} else {
|
|
|
|
fprintf(f, "%04lX..%04lX:%d\n",
|
|
|
|
(long)prevCode, (long)lastCode, prevCC);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
prevCode=code;
|
|
|
|
prevCC=cc;
|
|
|
|
}
|
|
|
|
if(isInBlockZero) {
|
|
|
|
code+=UTRIE_DATA_BLOCK_LENGTH;
|
|
|
|
} else {
|
|
|
|
++code;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static UBool
|
|
|
|
hasMapping(uint32_t code) {
|
|
|
|
Norm *norm=norms+utrie_get32(normTrie, code, NULL);
|
|
|
|
return norm->lenNFD!=0 || norm->lenNFKD!=0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static UBool
|
|
|
|
hasOneWayMapping(uint32_t code, UBool withCompat) {
|
|
|
|
for(;;) {
|
|
|
|
Norm *norm=norms+utrie_get32(normTrie, code, NULL);
|
|
|
|
uint8_t length;
|
|
|
|
if((length=norm->lenNFD)!=0) {
|
|
|
|
/*
|
|
|
|
* The canonical decomposition is a one-way mapping if
|
|
|
|
* - it does not map to exactly two code points
|
|
|
|
* - the code has ccc!=0
|
|
|
|
* - the code has the Composition_Exclusion property
|
|
|
|
* - its starter has a one-way mapping (loop for this)
|
|
|
|
* - its non-starter decomposes
|
|
|
|
*/
|
|
|
|
if( length!=2 ||
|
|
|
|
norm->udataCC!=0 ||
|
2010-06-04 18:01:08 +00:00
|
|
|
uset_contains(compositionExclusions, (UChar32)code) ||
|
2010-01-06 23:50:03 +00:00
|
|
|
hasMapping(norm->nfd[1])
|
|
|
|
) {
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
code=norm->nfd[0]; /* continue */
|
|
|
|
} else if(withCompat && norm->lenNFKD!=0) {
|
|
|
|
return TRUE;
|
|
|
|
} else {
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
writeAllMappings(FILE *f, UBool withCompat) {
|
|
|
|
uint32_t i, code;
|
|
|
|
UBool isInBlockZero;
|
|
|
|
|
|
|
|
if(withCompat) {
|
|
|
|
fprintf(f, "\n# Canonical and compatibility decomposition mappings\n");
|
|
|
|
} else {
|
|
|
|
fprintf(f, "\n# Canonical decomposition mappings\n");
|
|
|
|
}
|
|
|
|
for(code=0; code<=0x10ffff;) {
|
|
|
|
i=utrie_get32(normTrie, code, &isInBlockZero);
|
|
|
|
if(isInBlockZero) {
|
|
|
|
code+=UTRIE_DATA_BLOCK_LENGTH;
|
|
|
|
} else {
|
|
|
|
if(i!=0) {
|
|
|
|
uint32_t *s32;
|
|
|
|
uint8_t length;
|
|
|
|
char separator;
|
|
|
|
if((length=norms[i].lenNFD)!=0) {
|
|
|
|
s32=norms[i].nfd;
|
|
|
|
separator= hasOneWayMapping(code, withCompat) ? '>' : '=';
|
|
|
|
} else if(withCompat && (length=norms[i].lenNFKD)!=0) {
|
|
|
|
s32=norms[i].nfkd;
|
|
|
|
separator='>';
|
|
|
|
}
|
|
|
|
if(length!=0) {
|
|
|
|
uint8_t j;
|
|
|
|
fprintf(f, "%04lX%c", (long)code, separator);
|
|
|
|
for(j=0; j<length; ++j) {
|
|
|
|
if(j!=0) {
|
|
|
|
fputc(' ', f);
|
|
|
|
}
|
|
|
|
fprintf(f, "%04lX", (long)s32[j]);
|
|
|
|
}
|
|
|
|
fputc('\n', f);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
++code;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
writeNorm2TextFile(const char *path, const char *filename, UBool withCompat) {
|
|
|
|
FILE *f=usrc_createTextData(path, filename);
|
|
|
|
if(f==NULL) {
|
|
|
|
exit(U_FILE_ACCESS_ERROR);
|
|
|
|
}
|
|
|
|
writeAllCC(f);
|
|
|
|
writeAllMappings(f, withCompat);
|
|
|
|
fclose(f);
|
|
|
|
}
|
|
|
|
|
|
|
|
extern void
|
|
|
|
writeNorm2(const char *dataDir) {
|
|
|
|
writeNorm2TextFile(dataDir, "nfc.txt", FALSE);
|
|
|
|
writeNorm2TextFile(dataDir, "nfkc.txt", TRUE);
|
|
|
|
}
|
|
|
|
|
2001-07-03 16:45:54 +00:00
|
|
|
extern void
|
|
|
|
cleanUpData(void) {
|
|
|
|
utm_close(normMem);
|
|
|
|
utm_close(utf32Mem);
|
2003-06-18 16:44:22 +00:00
|
|
|
utrie_close(normTrie);
|
|
|
|
uprv_free(normTrie);
|
2010-06-04 18:01:08 +00:00
|
|
|
uset_close(compositionExclusions);
|
2001-07-03 16:45:54 +00:00
|
|
|
}
|
|
|
|
|
2003-05-06 01:37:52 +00:00
|
|
|
#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
|
|
|
|
2001-06-20 22:24:42 +00:00
|
|
|
/*
 * Hey, Emacs, please set the following:
 *
 * Local Variables:
 * indent-tabs-mode: nil
 * End:
 *
 */
|