1999-12-13 22:25:50 +00:00
|
|
|
/*
|
|
|
|
*******************************************************************************
|
|
|
|
*
|
2001-03-21 23:22:16 +00:00
|
|
|
* Copyright (C) 1999-2001, International Business Machines
|
1999-12-13 22:25:50 +00:00
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
*
|
|
|
|
*******************************************************************************
|
|
|
|
* file name: store.c
|
|
|
|
* encoding: US-ASCII
|
|
|
|
* tab size: 8 (not used)
|
|
|
|
* indentation:4
|
|
|
|
*
|
|
|
|
* created on: 1999dec11
|
|
|
|
* created by: Markus W. Scherer
|
|
|
|
*
|
|
|
|
* Store Unicode character properties efficiently for
|
|
|
|
* random access.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
1999-12-28 23:57:50 +00:00
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#include "unicode/uchar.h"
|
1999-12-13 22:25:50 +00:00
|
|
|
#include "cmemory.h"
|
|
|
|
#include "cstring.h"
|
|
|
|
#include "filestrm.h"
|
1999-12-28 23:57:50 +00:00
|
|
|
#include "unicode/udata.h"
|
1999-12-13 22:25:50 +00:00
|
|
|
#include "unewdata.h"
|
|
|
|
#include "genprops.h"
|
|
|
|
|
1999-12-15 04:42:56 +00:00
|
|
|
/* set to a non-zero value to compile the declarations guarded by #if DO_DEBUG_OUT */
#define DO_DEBUG_OUT 0
|
|
|
|
|
1999-12-13 22:25:50 +00:00
|
|
|
/* Unicode character properties file format ------------------------------------
|
|
|
|
|
|
|
|
The file format prepared and written here contains several data
|
|
|
|
structures that store indexes or data.
|
|
|
|
|
1999-12-17 21:28:21 +00:00
|
|
|
Before the data contents described below, there are the headers required by
|
|
|
|
the udata API for loading ICU data. Especially, a UDataInfo structure
|
|
|
|
precedes the actual data. It contains platform properties values and the
|
|
|
|
file format version.
|
|
|
|
|
2000-12-04 21:02:16 +00:00
|
|
|
The following is a description of format version 1.1 .
|
1999-12-17 21:28:21 +00:00
|
|
|
|
|
|
|
|
|
|
|
Data contents:
|
|
|
|
|
1999-12-13 22:25:50 +00:00
|
|
|
The contents is a parsed, binary form of several Unicode character
|
2000-04-18 16:56:02 +00:00
|
|
|
database files, most prominently UnicodeData.txt.
|
1999-12-13 22:25:50 +00:00
|
|
|
|
|
|
|
Any Unicode code point from 0 to 0x10ffff can be looked up to get
|
|
|
|
the properties, if any, for that code point. This means that the input
|
|
|
|
to the lookup are 21-bit unsigned integers, with not all of the
|
|
|
|
21-bit range used.
|
|
|
|
|
|
|
|
It is assumed that client code keeps a uint16_t pointer
|
|
|
|
to the beginning of the data:
|
|
|
|
|
|
|
|
const uint16_t *p16;
|
|
|
|
|
|
|
|
Some indexes assume 32-bit units; although client code should only
|
|
|
|
cast the above pointer to (const uint32_t *), it is easier here
|
|
|
|
to talk about the result of the indexing with the definition of
|
|
|
|
another pointer variable for this:
|
|
|
|
|
|
|
|
const uint32_t *p32=(const uint32_t *)p16;
|
|
|
|
|
|
|
|
Formally, the file contains the following structures:
|
|
|
|
|
1999-12-17 21:28:21 +00:00
|
|
|
A0 const uint16_t STAGE_2_BITS(=6);
|
|
|
|
A1 const uint16_t STAGE_3_BITS(=4);
|
|
|
|
(STAGE_1_BITS(=11) not stored, implicitly=21-(STAGE_2_BITS+STAGE_3_BITS))
|
|
|
|
A2 const uint16_t exceptionsIndex; -- 32-bit unit index
|
2000-12-08 01:08:43 +00:00
|
|
|
A3 const uint16_t stage3Index; -- 16-bit unit index of stage3, new in formatVersion 1.1
|
2000-12-04 21:02:16 +00:00
|
|
|
A4 const uint16_t propsIndex; -- 32-bit unit index, new in formatVersion 1.1
|
2000-12-08 01:08:43 +00:00
|
|
|
A5 const uint16_t exceptionsTopIndex; -- 32-bit unit index to the first unit after exceptions units, new in formatVersion 1.1
|
2001-01-31 18:05:19 +00:00
|
|
|
A6 const uint16_t ucharsTopIndex; -- 32-bit unit index to the first unit after the array of UChars for special casing
|
1999-12-15 19:04:11 +00:00
|
|
|
A7 const uint16_t reservedIndex;
|
|
|
|
|
1999-12-16 01:50:39 +00:00
|
|
|
S1 const uint16_t stage1[0x440]; -- 0x440=0x110000>>10
|
2000-12-08 01:08:43 +00:00
|
|
|
S2 const uint16_t stage2[variable size];
|
1999-12-17 21:28:21 +00:00
|
|
|
S3 const uint16_t stage3[variable size];
|
1999-12-15 19:04:11 +00:00
|
|
|
(possible 1*uint16_t for padding to 4-alignment)
|
|
|
|
|
1999-12-17 21:28:21 +00:00
|
|
|
P const uint32_t props32[variable size];
|
2000-12-08 01:08:43 +00:00
|
|
|
E const uint32_t exceptions[variable size];
|
1999-12-13 22:25:50 +00:00
|
|
|
|
|
|
|
3-stage lookup and properties:
|
|
|
|
|
|
|
|
In order to condense the data for the 21-bit code space, several properties of
|
|
|
|
the Unicode code assignment are exploited:
|
|
|
|
- The code space is sparse.
|
1999-12-15 04:42:56 +00:00
|
|
|
- There are several 10k of consecutive codes with the same properties.
|
1999-12-13 22:25:50 +00:00
|
|
|
- Characters and scripts are allocated in groups of 16 code points.
|
|
|
|
- Inside blocks for scripts the properties are often repetitive.
|
|
|
|
- The 21-bit space is not fully used for Unicode.
|
|
|
|
|
|
|
|
The three-stage lookup organizes code points in groups of 16 in stage 3.
|
1999-12-16 01:50:39 +00:00
|
|
|
64 such groups are grouped again, resulting in blocks of 64 indexes
|
|
|
|
for a total of 1k code points in stage 2.
|
1999-12-13 22:25:50 +00:00
|
|
|
The first stage is limited according to all code points being <0x110000.
|
|
|
|
Each stage contains indexes to groups or blocks of the next stage
|
1999-12-15 04:42:56 +00:00
|
|
|
in an n:1 manner, i.e., multiple entries of one stage may index the same
|
1999-12-13 22:25:50 +00:00
|
|
|
group or block in the next one.
|
1999-12-15 04:42:56 +00:00
|
|
|
In the second and third stages, groups of 64 or 16 may partially or completely
|
|
|
|
overlap to save space with repetitive properties.
|
1999-12-13 22:25:50 +00:00
|
|
|
In the properties table, only unique 32-bit words are stored to exploit
|
|
|
|
non-adjacent overlapping. This is why the third stage does not directly
|
|
|
|
contain the 32-bit properties words but only indexes to them.
|
|
|
|
|
|
|
|
The indexes in each stage take the offset in the data of the next block into
|
|
|
|
account to save additional arithmetic in the access.
|
|
|
|
|
|
|
|
With a given Unicode code point
|
|
|
|
|
|
|
|
uint32_t c;
|
|
|
|
|
|
|
|
and 0<=c<0x110000, the lookup uses the three stage tables to
|
|
|
|
arrive at an index into the props32[] table containing the character
|
|
|
|
properties for c.
|
|
|
|
For some characters, not all of the properties can be efficiently encoded
|
|
|
|
using 32 bits. For them, the 32-bit word contains an index into the exceptions[]
|
2000-04-18 16:56:02 +00:00
|
|
|
array.
|
1999-12-13 22:25:50 +00:00
|
|
|
|
|
|
|
The first stage consumes the 11 most significant bits of the 21-bit code point
|
|
|
|
and results in an index into the second stage:
|
|
|
|
|
1999-12-15 19:04:11 +00:00
|
|
|
uint16_t i2=p16[8+(c>>10)];
|
1999-12-13 22:25:50 +00:00
|
|
|
|
|
|
|
The second stage consumes bits 9 to 4 of c and results in an index into the
|
|
|
|
third stage:
|
|
|
|
|
|
|
|
uint16_t i3=p16[i2+((c>>4)&0x3f)];
|
|
|
|
|
|
|
|
The third stage consumes bits 3 to 0 of c and results in a code point-
|
|
|
|
specific value, which itself is only an index into the props32[] table:
|
|
|
|
|
|
|
|
uint16_t i=p16[i3+(c&0xf)];
|
|
|
|
|
2000-04-18 16:56:02 +00:00
|
|
|
Note that the bit numbers and shifts actually depend on the STAGE_2/3_BITS
|
|
|
|
in p16[0..1].
|
|
|
|
|
1999-12-13 22:25:50 +00:00
|
|
|
There is finally the 32-bit encoded set of properties for c:
|
|
|
|
|
|
|
|
uint32_t props=p32[i];
|
|
|
|
|
|
|
|
For some characters, this contains an index into the exceptions array:
|
|
|
|
|
2000-04-18 16:56:02 +00:00
|
|
|
if(props&EXCEPTION_BIT) {
|
|
|
|
uint16_t e=(uint16_t)(props>>VALUE_SHIFT);
|
1999-12-13 22:25:50 +00:00
|
|
|
...
|
|
|
|
}
|
|
|
|
|
2000-04-18 16:56:02 +00:00
|
|
|
The exception values are a variable number of uint32_t starting at
|
1999-12-13 22:25:50 +00:00
|
|
|
|
2000-04-18 16:56:02 +00:00
|
|
|
const uint32_t *pe=p32+exceptionsIndex+e;
|
1999-12-13 22:25:50 +00:00
|
|
|
|
2000-04-18 16:56:02 +00:00
|
|
|
The first uint32_t there contains flags about what values actually follow it.
|
|
|
|
Some of the exception values are UChar32 code points for the case mappings,
|
|
|
|
others are numeric values etc.
|
1999-12-13 22:25:50 +00:00
|
|
|
|
|
|
|
32-bit properties sets:
|
|
|
|
|
|
|
|
Each 32-bit properties word contains:
|
|
|
|
|
|
|
|
0.. 4 general category
|
|
|
|
5 has exception values
|
2000-04-18 16:56:02 +00:00
|
|
|
6..10 BiDi category
|
|
|
|
11 is mirrored
|
|
|
|
12..19 reserved
|
1999-12-13 22:25:50 +00:00
|
|
|
20..31 value according to bits 0..5:
|
|
|
|
if(has exception) {
|
|
|
|
exception index;
|
|
|
|
} else switch(general category) {
|
|
|
|
case Ll: delta to uppercase; -- same as titlecase
|
2000-05-18 19:10:27 +00:00
|
|
|
case Lu: -delta to lowercase; -- titlecase is same as c
|
|
|
|
case Lt: -delta to lowercase; -- uppercase is same as c
|
2000-04-18 16:56:02 +00:00
|
|
|
case Mn: combining class;
|
2000-05-18 19:10:27 +00:00
|
|
|
case Nd: value=numeric value==decimal digit value=digit value;
|
|
|
|
case Nl:
|
|
|
|
case No: value=numeric value - but decimal digit value and digit value are not defined;
|
2000-04-18 16:56:02 +00:00
|
|
|
default:
|
|
|
|
if(is mirrored) {
|
|
|
|
delta to mirror
|
|
|
|
} else {
|
|
|
|
0
|
|
|
|
};
|
1999-12-13 22:25:50 +00:00
|
|
|
}
|
|
|
|
|
1999-12-17 21:28:21 +00:00
|
|
|
Exception values:
|
|
|
|
|
2000-04-18 16:56:02 +00:00
|
|
|
In the first uint32_t exception word for a code point,
|
|
|
|
bits
|
|
|
|
31..24 reserved
|
|
|
|
23..16 combining class
|
|
|
|
15..0 flags that indicate which values follow:
|
1999-12-17 21:28:21 +00:00
|
|
|
|
2000-04-18 16:56:02 +00:00
|
|
|
bit
|
1999-12-17 21:28:21 +00:00
|
|
|
0 has uppercase mapping
|
|
|
|
1 has lowercase mapping
|
|
|
|
2 has titlecase mapping
|
2000-05-18 19:10:27 +00:00
|
|
|
3 has digit value(s)
|
|
|
|
4 has numeric value (numerator)
|
|
|
|
5 has denominator value
|
|
|
|
6 has a mirror-image Unicode code point
|
2001-01-31 18:05:19 +00:00
|
|
|
7 has SpecialCasing.txt entries
|
2001-02-14 00:45:29 +00:00
|
|
|
8 has CaseFolding.txt entries
|
1999-12-17 21:28:21 +00:00
|
|
|
|
2000-04-18 16:56:02 +00:00
|
|
|
According to the flags in this word, one or more uint32_t words follow it
|
1999-12-17 21:28:21 +00:00
|
|
|
in the sequence of the bit flags in the flags word; if a flag is not set,
|
|
|
|
then the value is missing or 0:
|
|
|
|
|
2000-04-18 16:56:02 +00:00
|
|
|
For the case mappings and the mirror-image Unicode code point,
|
|
|
|
one uint32_t or UChar32 each is the code point.
|
2000-05-18 19:10:27 +00:00
|
|
|
If the titlecase mapping is missing, then it is the same as the uppercase mapping.
|
|
|
|
|
|
|
|
For the digit values, bits 31..16 contain the decimal digit value, and
|
|
|
|
bits 15..0 contain the digit value. A value of -1 indicates that
|
|
|
|
this value is missing.
|
1999-12-17 21:28:21 +00:00
|
|
|
|
2000-04-18 16:56:02 +00:00
|
|
|
For the numeric/numerator value, an int32_t word contains the value directly,
|
1999-12-17 21:28:21 +00:00
|
|
|
except for when there is no numerator but a denominator, then the numerator
|
2000-05-18 19:10:27 +00:00
|
|
|
is implicitly 1. This means:
|
|
|
|
numerator denominator result
|
|
|
|
none none none
|
|
|
|
x none x
|
|
|
|
none y 1/y
|
|
|
|
x y x/y
|
1999-12-17 21:28:21 +00:00
|
|
|
|
2000-04-18 16:56:02 +00:00
|
|
|
For the denominator value, a uint32_t word contains the value directly.
|
1999-12-17 21:28:21 +00:00
|
|
|
|
2001-01-31 18:05:19 +00:00
|
|
|
For special casing mappings, the 32-bit exception word contains:
|
|
|
|
31 if set, this character has complex, conditional mappings
|
|
|
|
that are not stored;
|
|
|
|
otherwise, the mappings are stored according to the following bits
|
|
|
|
30..24 number of UChars used for mappings
|
|
|
|
23..16 reserved
|
|
|
|
15.. 0 UChar offset from the beginning of the UChars array where the
|
|
|
|
UChars for the special case mappings are stored in the following format:
|
|
|
|
|
|
|
|
Format of special casing UChars:
|
|
|
|
One UChar value with lengths as follows:
|
|
|
|
14..10 number of UChars for titlecase mapping
|
|
|
|
9.. 5 number of UChars for uppercase mapping
|
|
|
|
4.. 0 number of UChars for lowercase mapping
|
|
|
|
|
|
|
|
Followed by the UChars for lowercase, uppercase, titlecase mappings in this order.
|
|
|
|
|
2001-02-14 00:45:29 +00:00
|
|
|
For case folding mappings, the 32-bit exception word contains:
|
|
|
|
31..24 number of UChars used for the full mapping
|
|
|
|
23..16 reserved
|
|
|
|
15.. 0 UChar offset from the beginning of the UChars array where the
|
|
|
|
UChars for the case folding mappings are stored in the following format:
|
|
|
|
|
|
|
|
Format of case folding UChars:
|
|
|
|
Two UChars contain the simple mapping as follows:
|
|
|
|
0, 0 no simple mapping
|
|
|
|
BMP,0 a simple mapping to a BMP code point
|
|
|
|
s1, s2 a simple mapping to a supplementary code point stored as two surrogates
|
|
|
|
This is followed by the UChars for the full case folding mappings.
|
|
|
|
|
1999-12-17 21:28:21 +00:00
|
|
|
Example:
|
|
|
|
U+2160, ROMAN NUMERAL ONE, needs an exception because it has a lowercase
|
|
|
|
mapping and a numeric value.
|
2000-04-18 16:56:02 +00:00
|
|
|
Its exception values would be stored as 3 uint32_t words:
|
1999-12-17 21:28:21 +00:00
|
|
|
|
2000-04-18 16:56:02 +00:00
|
|
|
- flags=0x12 (bit 1: lowercase mapping, bit 4: numeric value; see above) with combining class 0
|
|
|
|
- lowercase mapping 0x2170
|
1999-12-17 21:28:21 +00:00
|
|
|
- numeric value=1
|
|
|
|
|
1999-12-13 22:25:50 +00:00
|
|
|
----------------------------------------------------------------------------- */
|
|
|
|
|
|
|
|
/* UDataInfo cf. udata.h */
|
2000-04-18 16:56:02 +00:00
|
|
|
static UDataInfo dataInfo={
|
1999-12-13 22:25:50 +00:00
|
|
|
sizeof(UDataInfo),
|
|
|
|
0,
|
|
|
|
|
|
|
|
U_IS_BIG_ENDIAN,
|
|
|
|
U_CHARSET_FAMILY,
|
|
|
|
U_SIZEOF_UCHAR,
|
|
|
|
0,
|
|
|
|
|
2001-03-26 21:14:50 +00:00
|
|
|
{0x55, 0x50, 0x72, 0x6f}, /* dataFormat="UPro" */
|
|
|
|
{1, 3, 0, 0}, /* formatVersion */
|
|
|
|
{3, 0, 0, 0} /* dataVersion */
|
1999-12-13 22:25:50 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * definitions and arrays for the 3-stage lookup
 *
 * A 21-bit code point is split into a stage 1 index (top 11 bits),
 * a stage 2 index (next 6 bits) and a stage 3 index (lowest 4 bits).
 * All other constants are derived from STAGE_2_BITS and STAGE_3_BITS.
 */
enum {
    STAGE_2_BITS=6, STAGE_3_BITS=4,
    STAGE_1_BITS=21-(STAGE_2_BITS+STAGE_3_BITS),

    /* right-shift distances that bring each stage's index bits down to bit 0 */
    STAGE_2_SHIFT=STAGE_3_BITS,
    STAGE_1_SHIFT=(STAGE_2_SHIFT+STAGE_2_BITS),

    /* number of entries per sub-table in each stage */
    STAGE_1_BLOCK=0x110000>>STAGE_1_SHIFT,
    STAGE_2_BLOCK=1<<STAGE_2_BITS,
    STAGE_3_BLOCK=1<<STAGE_3_BITS,

    /* number of code points per stage 1 index */
    STAGE_2_3_AREA=1<<STAGE_1_SHIFT,

    /* capacities of the statically allocated work arrays */
    MAX_PROPS_COUNT=25000,
    MAX_UCHAR_COUNT=10000,
    MAX_EXCEPTIONS_COUNT=4096,
    MAX_STAGE_2_COUNT=MAX_PROPS_COUNT
};
|
|
|
|
|
2000-04-18 16:56:02 +00:00
|
|
|
/*
 * definitions for the properties words
 *
 * Bit layout of a 32-bit properties word as used by makeProps():
 * general category in the lowest bits, the exception flag at bit 5,
 * the BiDi category from bit 6, the "is mirrored" flag at bit 11 and
 * the signed value (or exception index) in the top VALUE_BITS bits.
 */
enum {
    EXCEPTION_SHIFT=5,
    BIDI_SHIFT,                 /* implicitly EXCEPTION_SHIFT+1 */
    MIRROR_SHIFT=BIDI_SHIFT+5,
    VALUE_SHIFT=20,

    EXCEPTION_BIT=1UL<<EXCEPTION_SHIFT,
    VALUE_BITS=32-VALUE_SHIFT
};

/* range of a signed value that fits into the VALUE_BITS-wide value field */
static const int32_t MAX_VALUE=(1L<<(VALUE_BITS-1))-1;
static const int32_t MIN_VALUE=-(1L<<(VALUE_BITS-1));
|
2000-04-22 18:35:29 +00:00
|
|
|
|
1999-12-13 22:25:50 +00:00
|
|
|
static uint16_t stage1[STAGE_1_BLOCK], stage2[MAX_STAGE_2_COUNT],
|
|
|
|
stage3[MAX_PROPS_COUNT], map[MAX_PROPS_COUNT];
|
|
|
|
|
1999-12-15 04:42:56 +00:00
|
|
|
/* stage1Top=STAGE_1_BLOCK never changes, stage2Top starts after the empty-properties-group */
|
|
|
|
static uint16_t stage2Top=STAGE_2_BLOCK, stage3Top;
|
1999-12-13 22:25:50 +00:00
|
|
|
|
|
|
|
/* props[] is used before, props32[] after compacting the array of properties */
|
|
|
|
static uint32_t props[MAX_PROPS_COUNT], props32[MAX_PROPS_COUNT];
|
|
|
|
static uint16_t propsTop=STAGE_3_BLOCK; /* the first props[] are always empty */
|
|
|
|
|
|
|
|
/* exceptions values */
|
2000-04-18 16:56:02 +00:00
|
|
|
static uint32_t exceptions[MAX_EXCEPTIONS_COUNT+20];
|
1999-12-13 22:25:50 +00:00
|
|
|
static uint16_t exceptionsTop=0;
|
|
|
|
|
|
|
|
/* Unicode characters, e.g. for special casing or decomposition */
|
|
|
|
static UChar uchars[MAX_UCHAR_COUNT+20];
|
2001-01-31 18:05:19 +00:00
|
|
|
static uint32_t ucharsTop=0;
|
1999-12-13 22:25:50 +00:00
|
|
|
|
1999-12-15 04:42:56 +00:00
|
|
|
/* statistics */
|
|
|
|
static uint16_t exceptionsCount=0;
|
|
|
|
|
1999-12-13 22:25:50 +00:00
|
|
|
/* prototypes --------------------------------------------------------------- */
|
|
|
|
|
2001-03-15 20:38:36 +00:00
|
|
|
static void
|
|
|
|
repeatFromStage2(uint16_t i2, uint32_t start, uint32_t limit, uint16_t i3Repeat, uint32_t x);
|
1999-12-15 04:42:56 +00:00
|
|
|
|
1999-12-15 19:04:11 +00:00
|
|
|
static void
|
2001-03-15 20:38:36 +00:00
|
|
|
repeatFromStage3(uint16_t i3, uint32_t start, uint32_t limit, uint32_t x);
|
1999-12-15 19:04:11 +00:00
|
|
|
|
|
|
|
static uint16_t
|
|
|
|
compactStage(uint16_t *stage, uint16_t stageTop, uint16_t blockSize,
|
|
|
|
uint16_t *parent, uint16_t parentTop);
|
|
|
|
|
1999-12-13 22:25:50 +00:00
|
|
|
static int
|
|
|
|
compareProps(const void *l, const void *r);
|
|
|
|
|
2000-08-11 00:02:59 +00:00
|
|
|
#if DO_DEBUG_OUT
|
1999-12-15 04:42:56 +00:00
|
|
|
static uint32_t
|
|
|
|
getProps2(uint32_t c, uint16_t *pI1, uint16_t *pI2, uint16_t *pI3, uint16_t *pI4);
|
|
|
|
|
|
|
|
static uint32_t
|
|
|
|
getProps(uint32_t c, uint16_t *pI1, uint16_t *pI2, uint16_t *pI3);
|
2001-03-27 22:24:40 +00:00
|
|
|
#endif
|
1999-12-15 04:42:56 +00:00
|
|
|
|
|
|
|
static void
|
|
|
|
setProps(uint32_t c, uint32_t x, uint16_t *pI1, uint16_t *pI2, uint16_t *pI3);
|
|
|
|
|
|
|
|
static uint16_t
|
1999-12-22 22:57:04 +00:00
|
|
|
allocStage2(void);
|
1999-12-15 04:42:56 +00:00
|
|
|
|
|
|
|
static uint16_t
|
1999-12-22 22:57:04 +00:00
|
|
|
allocProps(void);
|
1999-12-15 04:42:56 +00:00
|
|
|
|
2001-01-31 18:05:19 +00:00
|
|
|
static uint32_t
|
|
|
|
addUChars(const UChar *s, uint32_t length);
|
1999-12-13 22:25:50 +00:00
|
|
|
|
|
|
|
/* -------------------------------------------------------------------------- */
|
|
|
|
|
2000-04-18 16:56:02 +00:00
|
|
|
extern void
|
|
|
|
setUnicodeVersion(const char *v) {
|
|
|
|
UVersionInfo version;
|
2001-02-14 00:45:29 +00:00
|
|
|
u_versionFromString(version, v);
|
2000-04-18 16:56:02 +00:00
|
|
|
uprv_memcpy(dataInfo.dataVersion, version, 4);
|
|
|
|
}
|
|
|
|
|
1999-12-13 22:25:50 +00:00
|
|
|
extern void
|
|
|
|
initStore() {
|
1999-12-28 23:57:50 +00:00
|
|
|
uprv_memset(stage1, 0, sizeof(stage1));
|
|
|
|
uprv_memset(stage2, 0, sizeof(stage2));
|
|
|
|
uprv_memset(stage3, 0, sizeof(stage3));
|
|
|
|
uprv_memset(map, 0, sizeof(map));
|
|
|
|
uprv_memset(props, 0, sizeof(props));
|
|
|
|
uprv_memset(props32, 0, sizeof(props32));
|
1999-12-13 22:25:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* store a character's properties ------------------------------------------- */
|
|
|
|
|
2001-03-15 20:38:36 +00:00
|
|
|
extern uint32_t
|
|
|
|
makeProps(Props *p) {
|
1999-12-15 04:42:56 +00:00
|
|
|
uint32_t x;
|
|
|
|
int32_t value;
|
1999-12-13 22:25:50 +00:00
|
|
|
uint16_t count;
|
2000-05-18 22:08:39 +00:00
|
|
|
UBool isNumber;
|
1999-12-13 22:25:50 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Simple ideas for reducing the number of bits for one character's
|
|
|
|
* properties:
|
|
|
|
*
|
|
|
|
* Some fields are only used for characters of certain
|
|
|
|
* general categories:
|
|
|
|
* - casing fields for letters and others, not for
|
|
|
|
* numbers & Mn
|
|
|
|
* + uppercase not for uppercase letters
|
|
|
|
* + lowercase not for lowercase letters
|
|
|
|
* + titlecase not for titlecase letters
|
|
|
|
*
|
|
|
|
* * most of the time, uppercase=titlecase
|
|
|
|
* - numeric fields for various digit & other types
|
|
|
|
* - canonical combining classes for non-spacing marks (Mn)
|
|
|
|
* * the above is not always true, for all three cases
|
|
|
|
*
|
|
|
|
* Using the same bits for alternate fields saves some space.
|
|
|
|
*
|
1999-12-16 01:50:39 +00:00
|
|
|
* For the canonical categories, there are only few actually used
|
|
|
|
* most of the time.
|
1999-12-13 22:25:50 +00:00
|
|
|
* They can be stored using 5 bits.
|
|
|
|
*
|
|
|
|
* In the BiDi categories, the 5 explicit codes are only ever
|
|
|
|
* assigned 1:1 to 5 well-known code points. Storing only one
|
|
|
|
* value for all "explicit codes" gets this down to 4 bits.
|
|
|
|
* Client code then needs to check for this special value
|
|
|
|
* and replace it by the real one using a 5-element table.
|
|
|
|
*
|
|
|
|
* The general categories Mn & Me, non-spacing & enclosing marks,
|
|
|
|
* are always NSM, and NSM are always of those categories.
|
|
|
|
*
|
|
|
|
* Digit values can often be derived from the code point value
|
|
|
|
* itself in a simple way.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
1999-12-15 04:42:56 +00:00
|
|
|
/* count the case mappings and other values competing for the value bit field */
|
|
|
|
x=0;
|
|
|
|
value=0;
|
1999-12-13 22:25:50 +00:00
|
|
|
count=0;
|
2000-08-11 00:02:59 +00:00
|
|
|
isNumber= (UBool)(genCategoryNames[p->generalCategory][0]=='N');
|
1999-12-15 04:42:56 +00:00
|
|
|
|
1999-12-13 22:25:50 +00:00
|
|
|
if(p->upperCase!=0) {
|
1999-12-15 04:42:56 +00:00
|
|
|
/* verify that no numbers and no Mn have case mappings */
|
2000-05-18 19:10:27 +00:00
|
|
|
if(p->generalCategory==U_LOWERCASE_LETTER) {
|
1999-12-15 04:42:56 +00:00
|
|
|
value=(int32_t)p->code-(int32_t)p->upperCase;
|
|
|
|
} else {
|
2000-04-18 16:56:02 +00:00
|
|
|
x=EXCEPTION_BIT;
|
1999-12-15 04:42:56 +00:00
|
|
|
}
|
1999-12-13 22:25:50 +00:00
|
|
|
++count;
|
|
|
|
}
|
|
|
|
if(p->lowerCase!=0) {
|
1999-12-15 04:42:56 +00:00
|
|
|
/* verify that no numbers and no Mn have case mappings */
|
2000-05-18 19:10:27 +00:00
|
|
|
if(p->generalCategory==U_UPPERCASE_LETTER || p->generalCategory==U_TITLECASE_LETTER) {
|
1999-12-15 04:42:56 +00:00
|
|
|
value=(int32_t)p->lowerCase-(int32_t)p->code;
|
|
|
|
} else {
|
2000-04-18 16:56:02 +00:00
|
|
|
x=EXCEPTION_BIT;
|
1999-12-15 04:42:56 +00:00
|
|
|
}
|
1999-12-13 22:25:50 +00:00
|
|
|
++count;
|
|
|
|
}
|
|
|
|
if(p->upperCase!=p->titleCase) {
|
2000-05-18 19:10:27 +00:00
|
|
|
x=EXCEPTION_BIT;
|
1999-12-13 22:25:50 +00:00
|
|
|
++count;
|
|
|
|
}
|
1999-12-15 04:42:56 +00:00
|
|
|
if(p->canonicalCombining>0) {
|
|
|
|
/* verify that only Mn has a canonical combining class */
|
2000-05-18 19:10:27 +00:00
|
|
|
if(p->generalCategory==U_NON_SPACING_MARK) {
|
1999-12-15 04:42:56 +00:00
|
|
|
value=p->canonicalCombining;
|
|
|
|
} else {
|
2000-04-18 16:56:02 +00:00
|
|
|
x=EXCEPTION_BIT;
|
1999-12-15 04:42:56 +00:00
|
|
|
}
|
|
|
|
++count;
|
|
|
|
}
|
2000-05-18 19:10:27 +00:00
|
|
|
if(p->generalCategory==U_DECIMAL_DIGIT_NUMBER) {
|
|
|
|
/* verify that all numeric fields contain the same value */
|
|
|
|
if(p->decimalDigitValue!=-1 && p->digitValue==p->decimalDigitValue &&
|
|
|
|
p->hasNumericValue && p->numericValue==p->decimalDigitValue &&
|
|
|
|
p->denominator==0
|
|
|
|
) {
|
|
|
|
value=p->decimalDigitValue;
|
|
|
|
} else {
|
|
|
|
x=EXCEPTION_BIT;
|
|
|
|
}
|
|
|
|
++count;
|
|
|
|
} else if(p->generalCategory==U_LETTER_NUMBER || p->generalCategory==U_OTHER_NUMBER) {
|
|
|
|
/* verify that only the numeric value field itself contains a value */
|
|
|
|
if(p->decimalDigitValue==-1 && p->digitValue==-1 && p->hasNumericValue) {
|
1999-12-15 04:42:56 +00:00
|
|
|
value=p->numericValue;
|
|
|
|
} else {
|
2000-04-18 16:56:02 +00:00
|
|
|
x=EXCEPTION_BIT;
|
1999-12-15 04:42:56 +00:00
|
|
|
}
|
|
|
|
++count;
|
2000-05-18 19:10:27 +00:00
|
|
|
} else if(p->decimalDigitValue!=-1 || p->digitValue!=-1 || p->hasNumericValue) {
|
|
|
|
/* verify that only numeric categories have numeric values */
|
|
|
|
x=EXCEPTION_BIT;
|
|
|
|
++count;
|
1999-12-13 22:25:50 +00:00
|
|
|
}
|
1999-12-15 04:42:56 +00:00
|
|
|
if(p->denominator!=0) {
|
|
|
|
/* verification for numeric category covered by the above */
|
2000-05-18 19:10:27 +00:00
|
|
|
x=EXCEPTION_BIT;
|
1999-12-15 04:42:56 +00:00
|
|
|
++count;
|
|
|
|
}
|
2000-04-18 16:56:02 +00:00
|
|
|
if(p->isMirrored) {
|
|
|
|
if(p->mirrorMapping!=0) {
|
|
|
|
value=(int32_t)p->mirrorMapping-(int32_t)p->code;
|
|
|
|
}
|
|
|
|
++count;
|
|
|
|
}
|
2001-01-31 18:05:19 +00:00
|
|
|
if(p->specialCasing!=NULL) {
|
|
|
|
x=EXCEPTION_BIT;
|
|
|
|
++count;
|
|
|
|
}
|
2001-02-14 00:45:29 +00:00
|
|
|
if(p->caseFolding!=NULL) {
|
|
|
|
x=EXCEPTION_BIT;
|
|
|
|
++count;
|
|
|
|
}
|
1999-12-15 04:42:56 +00:00
|
|
|
|
1999-12-16 01:50:39 +00:00
|
|
|
/* handle exceptions */
|
2000-04-18 16:56:02 +00:00
|
|
|
if(count>1 || x!=0 || value<MIN_VALUE || MAX_VALUE<value) {
|
1999-12-15 04:42:56 +00:00
|
|
|
/* this code point needs exception values */
|
2000-04-24 22:31:22 +00:00
|
|
|
if(beVerbose) {
|
1999-12-15 04:42:56 +00:00
|
|
|
if(x!=0) {
|
|
|
|
printf("*** code 0x%06x needs an exception because it is irregular\n", p->code);
|
|
|
|
} else if(count==1) {
|
2001-03-27 22:24:40 +00:00
|
|
|
printf("*** code 0x%06x needs an exception because its value would be %ld\n",
|
|
|
|
p->code, (long)value);
|
2000-04-22 18:35:29 +00:00
|
|
|
} else if(value<MIN_VALUE || MAX_VALUE<value) {
|
2001-03-27 22:24:40 +00:00
|
|
|
printf("*** code 0x%06x needs an exception because its value is out-of-bounds at %ld (not [%ld..%ld]\n",
|
|
|
|
p->code, (long)value, (long)MIN_VALUE, (long)MAX_VALUE);
|
1999-12-15 04:42:56 +00:00
|
|
|
} else {
|
|
|
|
printf("*** code 0x%06x needs an exception because it has %u values\n", p->code, count);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
++exceptionsCount;
|
2000-04-18 16:56:02 +00:00
|
|
|
x=EXCEPTION_BIT;
|
|
|
|
|
|
|
|
/* allocate and create exception values */
|
|
|
|
value=exceptionsTop;
|
|
|
|
if(value>=4096) {
|
2000-04-22 18:35:29 +00:00
|
|
|
fprintf(stderr, "genprops: out of exceptions memory at U+%06x. (%d exceeds allocated space)\n",
|
|
|
|
p->code, value);
|
2000-04-18 16:56:02 +00:00
|
|
|
exit(U_MEMORY_ALLOCATION_ERROR);
|
|
|
|
} else {
|
|
|
|
uint32_t first=(uint32_t)p->canonicalCombining<<16;
|
|
|
|
uint16_t length=1;
|
1999-12-13 22:25:50 +00:00
|
|
|
|
2000-04-18 16:56:02 +00:00
|
|
|
if(p->upperCase!=0) {
|
|
|
|
first|=1;
|
|
|
|
exceptions[value+length++]=p->upperCase;
|
|
|
|
}
|
|
|
|
if(p->lowerCase!=0) {
|
|
|
|
first|=2;
|
|
|
|
exceptions[value+length++]=p->lowerCase;
|
|
|
|
}
|
|
|
|
if(p->upperCase!=p->titleCase) {
|
|
|
|
first|=4;
|
2000-05-18 19:10:27 +00:00
|
|
|
if(p->titleCase!=0) {
|
|
|
|
exceptions[value+length++]=p->titleCase;
|
|
|
|
} else {
|
|
|
|
exceptions[value+length++]=p->code;
|
2000-04-18 16:56:02 +00:00
|
|
|
}
|
2000-05-18 19:10:27 +00:00
|
|
|
}
|
|
|
|
if(p->decimalDigitValue!=-1 || p->digitValue!=-1) {
|
|
|
|
first|=8;
|
|
|
|
exceptions[value+length++]=
|
|
|
|
(uint32_t)p->decimalDigitValue<<16|
|
|
|
|
(uint16_t)p->digitValue;
|
|
|
|
}
|
|
|
|
if(p->hasNumericValue) {
|
|
|
|
if(p->denominator==0) {
|
|
|
|
first|=0x10;
|
|
|
|
exceptions[value+length++]=(uint32_t)p->numericValue;
|
|
|
|
} else {
|
|
|
|
if(p->numericValue!=1) {
|
|
|
|
first|=0x10;
|
|
|
|
exceptions[value+length++]=(uint32_t)p->numericValue;
|
|
|
|
}
|
|
|
|
first|=0x20;
|
|
|
|
exceptions[value+length++]=p->denominator;
|
2000-04-18 16:56:02 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if(p->isMirrored) {
|
2000-05-18 19:10:27 +00:00
|
|
|
first|=0x40;
|
2000-04-18 16:56:02 +00:00
|
|
|
exceptions[value+length++]=p->mirrorMapping;
|
|
|
|
}
|
2001-01-31 18:05:19 +00:00
|
|
|
if(p->specialCasing!=NULL) {
|
2001-02-14 00:45:29 +00:00
|
|
|
first|=0x80;
|
2001-01-31 18:05:19 +00:00
|
|
|
if(p->specialCasing->isComplex) {
|
|
|
|
/* complex special casing */
|
|
|
|
exceptions[value+length++]=0x80000000;
|
|
|
|
} else {
|
|
|
|
/* unconditional special casing */
|
|
|
|
UChar u[128];
|
|
|
|
uint32_t i;
|
|
|
|
uint16_t j, entry;
|
|
|
|
|
|
|
|
i=1;
|
|
|
|
entry=0;
|
|
|
|
j=p->specialCasing->lowerCase[0];
|
|
|
|
if(j>0) {
|
|
|
|
uprv_memcpy(u+1, p->specialCasing->lowerCase+1, 2*j);
|
|
|
|
i+=j;
|
|
|
|
entry=j;
|
|
|
|
}
|
|
|
|
j=p->specialCasing->upperCase[0];
|
|
|
|
if(j>0) {
|
|
|
|
uprv_memcpy(u+i, p->specialCasing->upperCase+1, 2*j);
|
|
|
|
i+=j;
|
|
|
|
entry|=j<<5;
|
|
|
|
}
|
|
|
|
j=p->specialCasing->titleCase[0];
|
|
|
|
if(j>0) {
|
|
|
|
uprv_memcpy(u+i, p->specialCasing->titleCase+1, 2*j);
|
|
|
|
i+=j;
|
|
|
|
entry|=j<<10;
|
|
|
|
}
|
|
|
|
u[0]=entry;
|
2000-04-18 16:56:02 +00:00
|
|
|
|
2001-01-31 18:05:19 +00:00
|
|
|
exceptions[value+length++]=(i<<24)|addUChars(u, i);
|
|
|
|
}
|
|
|
|
}
|
2001-02-14 00:45:29 +00:00
|
|
|
if(p->caseFolding!=NULL) {
|
|
|
|
first|=0x100;
|
|
|
|
if(p->caseFolding->simple==0 && p->caseFolding->full[0]==0) {
|
|
|
|
/* special case folding, store only a marker */
|
|
|
|
exceptions[value+length++]=0;
|
|
|
|
} else {
|
|
|
|
/* normal case folding with a simple and a full mapping */
|
|
|
|
UChar u[128];
|
|
|
|
uint16_t i;
|
|
|
|
|
|
|
|
/* store the simple mapping into the first two UChars */
|
|
|
|
i=0;
|
|
|
|
u[1]=0;
|
|
|
|
UTF_APPEND_CHAR_UNSAFE(u, i, p->caseFolding->simple);
|
|
|
|
|
|
|
|
/* store the full mapping after that */
|
|
|
|
i=p->caseFolding->full[0];
|
|
|
|
if(i>0) {
|
|
|
|
uprv_memcpy(u+2, p->caseFolding->full+1, 2*i);
|
|
|
|
}
|
|
|
|
|
|
|
|
exceptions[value+length++]=(i<<24)|addUChars(u, 2+i);
|
|
|
|
}
|
|
|
|
}
|
2000-04-18 16:56:02 +00:00
|
|
|
exceptions[value]=first;
|
|
|
|
exceptionsTop+=length;
|
|
|
|
}
|
1999-12-13 22:25:50 +00:00
|
|
|
}
|
|
|
|
|
1999-12-16 01:50:39 +00:00
|
|
|
/* put together the 32-bit word of encoded properties */
|
1999-12-15 04:42:56 +00:00
|
|
|
x|=
|
2000-04-18 16:56:02 +00:00
|
|
|
(uint32_t)p->generalCategory |
|
|
|
|
(uint32_t)p->bidi<<BIDI_SHIFT |
|
|
|
|
(uint32_t)p->isMirrored<<MIRROR_SHIFT |
|
|
|
|
(uint32_t)value<<VALUE_SHIFT;
|
1999-12-15 04:42:56 +00:00
|
|
|
|
2000-04-24 22:31:22 +00:00
|
|
|
if(beVerbose && p->code<=0x9f) {
|
|
|
|
if(p->code==0) {
|
|
|
|
printf("static uint32_t staticProps32Table[0xa0]={\n");
|
|
|
|
}
|
|
|
|
if(x&EXCEPTION_BIT) {
|
2001-01-31 18:05:19 +00:00
|
|
|
/* ### TODO: do something more intelligent if there is an exception */
|
2001-03-27 22:24:40 +00:00
|
|
|
printf(" /* 0x%02lx */ 0x%lx, /* has exception */\n",
|
|
|
|
(unsigned long)p->code, (unsigned long)x&~EXCEPTION_BIT);
|
2000-04-24 22:31:22 +00:00
|
|
|
} else {
|
2001-03-27 22:24:40 +00:00
|
|
|
printf(" /* 0x%02lx */ 0x%lx,\n",
|
|
|
|
(unsigned long)p->code, (unsigned long)x);
|
2000-04-24 22:31:22 +00:00
|
|
|
}
|
|
|
|
if(p->code==0x9f) {
|
|
|
|
printf("};\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2001-03-15 20:38:36 +00:00
|
|
|
return x;
|
|
|
|
|
1999-12-13 22:25:50 +00:00
|
|
|
/*
|
1999-12-16 01:50:39 +00:00
|
|
|
* "Higher-hanging fruit" (not implemented):
|
|
|
|
*
|
1999-12-13 22:25:50 +00:00
|
|
|
* For some sets of fields, there are fewer sets of values
|
|
|
|
* than the product of the numbers of values per field.
|
|
|
|
* This means that storing one single value for more than
|
|
|
|
* one field and later looking up both field values in a table
|
|
|
|
* saves space.
|
|
|
|
* Examples:
|
|
|
|
* - general category & BiDi
|
|
|
|
*
|
|
|
|
* There are only few common displacements between a code point
|
|
|
|
* and its case mappings. Store deltas. Store codes for few
|
|
|
|
* occuring deltas.
|
|
|
|
*/
|
1999-12-15 04:42:56 +00:00
|
|
|
}
|
|
|
|
|
2001-03-15 20:38:36 +00:00
|
|
|
/*
 * Store the encoded properties word x for the single code point c.
 *
 * This is a thin wrapper around setProps(); the three uint16_t values that
 * setProps() takes pointers for are not needed by the caller, so a single
 * dummy variable is passed for all of them.
 *
 * @param c the code point
 * @param x the 32-bit properties word, e.g. as returned by makeProps()
 */
extern void
addProps(uint32_t c, uint32_t x) {
    uint16_t notUsed;

    setProps(c, x, &notUsed, &notUsed, &notUsed);
}
|
|
|
|
|
1999-12-15 04:42:56 +00:00
|
|
|
/* areas of same properties ------------------------------------------------- */
|
|
|
|
|
|
|
|
extern void
|
2001-03-15 20:38:36 +00:00
|
|
|
repeatProps(uint32_t first, uint32_t last, uint32_t x) {
|
1999-12-15 04:42:56 +00:00
|
|
|
/*
|
|
|
|
* Set the repetitive properties for the big, known areas of all the same
|
|
|
|
* character properties. Most of those will share the same stage 2 and 3
|
|
|
|
* tables.
|
|
|
|
*
|
|
|
|
* Assumptions:
|
|
|
|
* - each area starts at a code point that is a multiple of 16
|
|
|
|
* - there may be some properties already stored for some code points,
|
|
|
|
* especially in the Private Use areas
|
|
|
|
*/
|
|
|
|
|
2001-03-15 20:38:36 +00:00
|
|
|
uint16_t i1, i2, j3, i1Limit, i2Repeat, i3Repeat;
|
|
|
|
uint32_t start, next, limit;
|
1999-12-15 04:42:56 +00:00
|
|
|
|
|
|
|
/* fill in the repetitive properties */
|
2001-03-15 20:38:36 +00:00
|
|
|
start=first;
|
|
|
|
limit=last+1;
|
1999-12-15 04:42:56 +00:00
|
|
|
|
2001-03-15 20:38:36 +00:00
|
|
|
/* allocate a stage 3 block and set all of its properties to x */
|
|
|
|
i3Repeat=allocProps();
|
|
|
|
for(j3=0; j3<STAGE_3_BLOCK; ++j3) {
|
|
|
|
props[i3Repeat+j3]=x;
|
|
|
|
}
|
1999-12-15 04:42:56 +00:00
|
|
|
|
2001-03-15 20:38:36 +00:00
|
|
|
/* we will need to allocate a stage 2 block if we use an entire one at all */
|
|
|
|
i2Repeat=0;
|
1999-12-15 19:04:11 +00:00
|
|
|
|
2001-03-15 20:38:36 +00:00
|
|
|
i1=(uint16_t)(start>>STAGE_1_SHIFT);
|
|
|
|
i1Limit=(uint16_t)(limit>>STAGE_1_SHIFT);
|
1999-12-15 04:42:56 +00:00
|
|
|
|
2001-03-15 20:38:36 +00:00
|
|
|
/*
|
|
|
|
* now there are up to three sub-areas:
|
|
|
|
* - a range of code points before the first full block for
|
|
|
|
* one stage 1 index
|
|
|
|
* - a (big) range of code points within full blocks for
|
|
|
|
* stage 1 indexes
|
|
|
|
* - a range of code points after the last full block for
|
|
|
|
* one stage 1 index
|
|
|
|
*/
|
1999-12-15 04:42:56 +00:00
|
|
|
|
2001-03-15 20:38:36 +00:00
|
|
|
if((start&(STAGE_2_3_AREA-1))!=0) {
|
|
|
|
/* incomplete stage 2 block at the beginning */
|
|
|
|
/* allocate the stage 2 block if necessary */
|
|
|
|
i2=stage1[i1];
|
|
|
|
if(i2==0) {
|
|
|
|
stage1[i1]=i2=allocStage2();
|
1999-12-15 04:42:56 +00:00
|
|
|
}
|
|
|
|
|
2001-03-15 20:38:36 +00:00
|
|
|
/* fill stages 2 & 3 of this sub-area */
|
|
|
|
if(i1<i1Limit) {
|
|
|
|
/* the stage 2 block goes to the end */
|
|
|
|
next=(i1+1)<<STAGE_1_SHIFT;
|
|
|
|
repeatFromStage2(i2, start, next, i3Repeat, x);
|
|
|
|
start=next;
|
1999-12-15 04:42:56 +00:00
|
|
|
|
|
|
|
/* advance i1 to the first full block */
|
|
|
|
++i1;
|
|
|
|
} else {
|
2001-03-15 20:38:36 +00:00
|
|
|
/* there is only one stage 2 block at all */
|
|
|
|
repeatFromStage2(i2, start, limit, i3Repeat, x);
|
|
|
|
return;
|
1999-12-15 04:42:56 +00:00
|
|
|
}
|
2001-03-15 20:38:36 +00:00
|
|
|
}
|
1999-12-15 04:42:56 +00:00
|
|
|
|
2001-03-15 20:38:36 +00:00
|
|
|
while(i1<i1Limit) {
|
|
|
|
/* fill complete stage 2 blocks */
|
|
|
|
next=start+STAGE_2_3_AREA;
|
|
|
|
i2=stage1[i1];
|
|
|
|
if(i2==0) {
|
|
|
|
/* set the index for common repeat block for stage 2 */
|
|
|
|
if(i2Repeat==0) {
|
|
|
|
/* allocate and fill a stage 2 block for this */
|
|
|
|
uint16_t j2;
|
|
|
|
|
|
|
|
i2Repeat=allocStage2();
|
|
|
|
for(j2=0; j2<STAGE_2_BLOCK; ++j2) {
|
|
|
|
stage2[i2Repeat+j2]=i3Repeat;
|
1999-12-15 04:42:56 +00:00
|
|
|
}
|
|
|
|
}
|
2001-03-15 20:38:36 +00:00
|
|
|
stage1[i1]=i2Repeat;
|
|
|
|
} else {
|
|
|
|
repeatFromStage2(i2, start, next, i3Repeat, x);
|
|
|
|
}
|
|
|
|
start=next;
|
|
|
|
++i1;
|
|
|
|
}
|
1999-12-15 04:42:56 +00:00
|
|
|
|
2001-03-15 20:38:36 +00:00
|
|
|
if(start<limit) {
|
|
|
|
/* fill the area after the last full block */
|
|
|
|
i2=stage1[i1];
|
|
|
|
if(i2==0) {
|
|
|
|
stage1[i1]=i2=allocStage2();
|
1999-12-15 04:42:56 +00:00
|
|
|
}
|
2001-03-15 20:38:36 +00:00
|
|
|
|
|
|
|
repeatFromStage2(i2, start, limit, i3Repeat, x);
|
1999-12-15 04:42:56 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2001-03-15 20:38:36 +00:00
|
|
|
/* set a section of a stage 2 table and its properties to x */
|
|
|
|
static void
|
|
|
|
repeatFromStage2(uint16_t i2, uint32_t start, uint32_t limit, uint16_t i3Repeat, uint32_t x) {
|
|
|
|
uint32_t next;
|
|
|
|
uint16_t i2Limit, i3;
|
|
|
|
|
|
|
|
/* remove irrelevant bits from start and limit */
|
|
|
|
start&=STAGE_2_3_AREA-1;
|
|
|
|
limit=((limit-1)&(STAGE_2_3_AREA-1))+1;
|
|
|
|
|
2001-03-26 21:14:50 +00:00
|
|
|
i2Limit=(uint16_t)(i2+(limit>>STAGE_3_BITS));
|
2001-03-15 20:38:36 +00:00
|
|
|
i2+=(uint16_t)(start>>STAGE_3_BITS);
|
|
|
|
|
|
|
|
/* similar to repeatProps(), there may be 3 sub-areas */
|
|
|
|
if((start&(STAGE_3_BLOCK-1))!=0) {
|
|
|
|
/* incomplete stage 3 block at the beginning */
|
|
|
|
i3=stage2[i2];
|
|
|
|
if(i3==0) {
|
|
|
|
stage2[i2]=i3=allocProps();
|
|
|
|
}
|
|
|
|
|
|
|
|
if(i2<i2Limit) {
|
|
|
|
/* the stage 3 block goes to the end */
|
|
|
|
next=(i2+1)<<STAGE_3_BITS;
|
|
|
|
repeatFromStage3(i3, start, next, x);
|
|
|
|
start=next;
|
|
|
|
++i2;
|
|
|
|
} else {
|
|
|
|
/* there is only one stage 3 block at all */
|
|
|
|
repeatFromStage3(i3, start, limit, x);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
1999-12-15 04:42:56 +00:00
|
|
|
|
|
|
|
while(i2<i2Limit) {
|
2001-03-15 20:38:36 +00:00
|
|
|
/* fill complete stage 3 blocks */
|
|
|
|
next=start+STAGE_3_BLOCK;
|
1999-12-15 04:42:56 +00:00
|
|
|
i3=stage2[i2];
|
|
|
|
if(i3==0) {
|
|
|
|
stage2[i2]=i3Repeat;
|
|
|
|
} else {
|
2001-03-15 20:38:36 +00:00
|
|
|
repeatFromStage3(i3, start, next, x);
|
1999-12-15 04:42:56 +00:00
|
|
|
}
|
2001-03-15 20:38:36 +00:00
|
|
|
start=next;
|
1999-12-15 04:42:56 +00:00
|
|
|
++i2;
|
|
|
|
}
|
1999-12-13 22:25:50 +00:00
|
|
|
|
2001-03-15 20:38:36 +00:00
|
|
|
if(start<limit) {
|
|
|
|
i3=stage2[i2];
|
1999-12-15 19:04:11 +00:00
|
|
|
if(i3==0) {
|
|
|
|
stage2[i2]=i3=allocProps();
|
|
|
|
}
|
|
|
|
|
2001-03-15 20:38:36 +00:00
|
|
|
repeatFromStage3(i3, start, limit, x);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
repeatFromStage3(uint16_t i3, uint32_t start, uint32_t limit, uint32_t x) {
|
|
|
|
uint16_t i3End;
|
|
|
|
|
|
|
|
i3End=(uint16_t)(i3+((limit-1)&(STAGE_3_BLOCK-1)));
|
|
|
|
i3+=(uint16_t)(start&(STAGE_3_BLOCK-1));
|
|
|
|
|
|
|
|
while(i3<=i3End) {
|
1999-12-15 19:04:11 +00:00
|
|
|
/* some properties may be set in this stage 3 block */
|
2001-03-15 20:38:36 +00:00
|
|
|
if(props[i3]==0) {
|
|
|
|
props[i3]=x;
|
|
|
|
}
|
|
|
|
++i3;
|
1999-12-15 19:04:11 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
1999-12-13 22:25:50 +00:00
|
|
|
/* compacting --------------------------------------------------------------- */
|
|
|
|
|
1999-12-15 04:42:56 +00:00
|
|
|
extern void
|
1999-12-22 22:57:04 +00:00
|
|
|
compactStage2(void) {
|
1999-12-15 19:04:11 +00:00
|
|
|
uint16_t newTop=compactStage(stage2, stage2Top, STAGE_2_BLOCK, stage1, STAGE_1_BLOCK);
|
1999-12-15 04:42:56 +00:00
|
|
|
|
1999-12-15 19:04:11 +00:00
|
|
|
/* we saved some space */
|
|
|
|
if(beVerbose) {
|
|
|
|
printf("compactStage2() reduced stage2Top from %u to %u\n", stage2Top, newTop);
|
|
|
|
}
|
|
|
|
stage2Top=newTop;
|
1999-12-15 04:42:56 +00:00
|
|
|
|
2000-08-11 00:02:59 +00:00
|
|
|
#if DO_DEBUG_OUT
|
|
|
|
{
|
2000-04-24 22:31:22 +00:00
|
|
|
/* debug output */
|
1999-12-15 19:04:11 +00:00
|
|
|
uint16_t i1, i2, i3, i4;
|
|
|
|
uint32_t c;
|
|
|
|
for(c=0; c<0xffff; c+=307) {
|
|
|
|
printf("properties(0x%06x)=0x%06x\n", c, getProps2(c, &i1, &i2, &i3, &i4));
|
1999-12-15 04:42:56 +00:00
|
|
|
}
|
|
|
|
}
|
2000-08-11 00:02:59 +00:00
|
|
|
#endif
|
1999-12-15 19:04:11 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
extern void
|
1999-12-22 22:57:04 +00:00
|
|
|
compactStage3(void) {
|
1999-12-15 19:04:11 +00:00
|
|
|
uint16_t newTop=compactStage(stage3, stage3Top, STAGE_3_BLOCK, stage2, stage2Top);
|
1999-12-15 04:42:56 +00:00
|
|
|
|
|
|
|
/* we saved some space */
|
|
|
|
if(beVerbose) {
|
1999-12-15 19:04:11 +00:00
|
|
|
printf("compactStage3() reduced stage3Top from %u to %u\n", stage3Top, newTop);
|
1999-12-15 04:42:56 +00:00
|
|
|
}
|
1999-12-15 19:04:11 +00:00
|
|
|
stage3Top=newTop;
|
1999-12-15 04:42:56 +00:00
|
|
|
|
2000-08-11 00:02:59 +00:00
|
|
|
#if DO_DEBUG_OUT
|
|
|
|
{
|
2000-04-24 22:31:22 +00:00
|
|
|
/* debug output */
|
1999-12-15 04:42:56 +00:00
|
|
|
uint16_t i1, i2, i3, i4;
|
|
|
|
uint32_t c;
|
|
|
|
for(c=0; c<0xffff; c+=307) {
|
|
|
|
printf("properties(0x%06x)=0x%06x\n", c, getProps2(c, &i1, &i2, &i3, &i4));
|
|
|
|
}
|
|
|
|
}
|
2000-08-11 00:02:59 +00:00
|
|
|
#endif
|
1999-12-15 04:42:56 +00:00
|
|
|
}
|
|
|
|
|
1999-12-15 19:04:11 +00:00
|
|
|
static uint16_t
|
|
|
|
compactStage(uint16_t *stage, uint16_t stageTop, uint16_t blockSize,
|
|
|
|
uint16_t *parent, uint16_t parentTop) {
|
1999-12-13 22:25:50 +00:00
|
|
|
/*
|
1999-12-15 19:04:11 +00:00
|
|
|
* This function is the common implementation for compacting
|
|
|
|
* a stage table.
|
|
|
|
* There are stageTop entries (indexes) in stage[].
|
|
|
|
* stageTop is a multiple of blockSize, and there are always blockSize stage[] entries
|
|
|
|
* per parent stage entry which do not overlap - yet.
|
|
|
|
* The first blockSize stage[] entries are always the empty ones.
|
|
|
|
* We make the blocks overlap appropriately here and fill every blockSize-th entry in
|
1999-12-13 22:25:50 +00:00
|
|
|
* map[] with the mapping from old to new properties indexes
|
1999-12-15 19:04:11 +00:00
|
|
|
* in order to adjust the parent stage tables.
|
1999-12-13 22:25:50 +00:00
|
|
|
* This simple algorithm does not find arbitrary overlaps, but only those
|
1999-12-15 19:04:11 +00:00
|
|
|
* where the last i entries of the previous block and the first i of the
|
1999-12-13 22:25:50 +00:00
|
|
|
* current one all have the same value.
|
|
|
|
* This seems reasonable and yields linear performance.
|
|
|
|
*/
|
1999-12-15 04:42:56 +00:00
|
|
|
uint16_t i, start, prevEnd, newStart, x;
|
1999-12-13 22:25:50 +00:00
|
|
|
|
|
|
|
map[0]=0;
|
1999-12-15 19:04:11 +00:00
|
|
|
newStart=blockSize;
|
|
|
|
for(start=newStart; start<stageTop;) {
|
2000-08-11 00:02:59 +00:00
|
|
|
prevEnd=(uint16_t)(newStart-1);
|
1999-12-15 19:04:11 +00:00
|
|
|
x=stage[start];
|
|
|
|
if(x==stage[prevEnd]) {
|
1999-12-13 22:25:50 +00:00
|
|
|
/* overlap by at least one */
|
1999-12-15 19:04:11 +00:00
|
|
|
for(i=1; i<blockSize && x==stage[start+i] && x==stage[prevEnd-i]; ++i) {}
|
1999-12-13 22:25:50 +00:00
|
|
|
|
|
|
|
/* overlap by i */
|
2000-08-11 00:02:59 +00:00
|
|
|
map[start]=(uint16_t)(newStart-i);
|
1999-12-13 22:25:50 +00:00
|
|
|
|
1999-12-15 19:04:11 +00:00
|
|
|
/* move the non-overlapping indexes to their new positions */
|
1999-12-13 22:25:50 +00:00
|
|
|
start+=i;
|
2000-08-11 00:02:59 +00:00
|
|
|
for(i=(uint16_t)(blockSize-i); i>0; --i) {
|
1999-12-15 19:04:11 +00:00
|
|
|
stage[newStart++]=stage[start++];
|
1999-12-13 22:25:50 +00:00
|
|
|
}
|
|
|
|
} else if(newStart<start) {
|
1999-12-15 19:04:11 +00:00
|
|
|
/* move the indexes to their new positions */
|
1999-12-13 22:25:50 +00:00
|
|
|
map[start]=newStart;
|
1999-12-15 19:04:11 +00:00
|
|
|
for(i=blockSize; i>0; --i) {
|
|
|
|
stage[newStart++]=stage[start++];
|
1999-12-13 22:25:50 +00:00
|
|
|
}
|
|
|
|
} else /* no overlap && newStart==start */ {
|
|
|
|
map[start]=start;
|
1999-12-15 19:04:11 +00:00
|
|
|
newStart+=blockSize;
|
1999-12-13 22:25:50 +00:00
|
|
|
start=newStart;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
1999-12-15 19:04:11 +00:00
|
|
|
/* now adjust the parent stage table */
|
|
|
|
for(i=0; i<parentTop; ++i) {
|
|
|
|
parent[i]=map[parent[i]];
|
1999-12-13 22:25:50 +00:00
|
|
|
}
|
|
|
|
|
1999-12-15 19:04:11 +00:00
|
|
|
/* we saved some space */
|
2000-08-11 00:02:59 +00:00
|
|
|
return (uint16_t)(stageTop-(start-newStart));
|
1999-12-13 22:25:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
extern void
|
1999-12-22 22:57:04 +00:00
|
|
|
compactProps(void) {
|
1999-12-13 22:25:50 +00:00
|
|
|
/*
|
|
|
|
* At this point, all the propsTop properties are in props[], but they
|
|
|
|
* are not all unique.
|
|
|
|
* Now we sort them, reduce them to unique ones in props32[], and
|
|
|
|
* build an index in stage3[] from the old to the new indexes.
|
|
|
|
* (The quick sort averages at N*log(N) with N=propsTop. The inverting
|
|
|
|
* yields linear performance.)
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We are going to sort only an index table in map[] because we need this
|
|
|
|
* index table anyway and qsort() does not allow to sort two tables together
|
|
|
|
* directly. This will thus also reduce the amount of data moved around.
|
|
|
|
*/
|
|
|
|
uint16_t i, oldIndex, newIndex;
|
|
|
|
uint32_t x;
|
2000-08-11 00:02:59 +00:00
|
|
|
#if DO_DEBUG_OUT
|
|
|
|
{
|
2000-04-24 22:31:22 +00:00
|
|
|
/* debug output */
|
1999-12-15 04:42:56 +00:00
|
|
|
uint16_t i1, i2, i3;
|
|
|
|
uint32_t c;
|
|
|
|
for(c=0; c<0xffff; c+=307) {
|
|
|
|
printf("properties(0x%06x)=0x%06x\n", c, getProps(c, &i1, &i2, &i3));
|
|
|
|
}
|
|
|
|
}
|
2000-08-11 00:02:59 +00:00
|
|
|
#endif
|
1999-12-13 22:25:50 +00:00
|
|
|
|
|
|
|
/* build the index table */
|
|
|
|
for(i=propsTop; i>0;) {
|
|
|
|
--i;
|
|
|
|
map[i]=i;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* do not reorder the first, empty entries */
|
|
|
|
qsort(map+STAGE_3_BLOCK, propsTop-STAGE_3_BLOCK, 2, compareProps);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Now invert the reordered table and compact it in the same step.
|
|
|
|
* The result will be props32[] having only unique properties words
|
|
|
|
* and stage3[] having indexes to them.
|
|
|
|
*/
|
|
|
|
newIndex=0;
|
|
|
|
for(i=0; i<propsTop;) {
|
|
|
|
/* set the first of a possible series of the same properties */
|
|
|
|
oldIndex=map[i];
|
|
|
|
props32[newIndex]=x=props[oldIndex];
|
|
|
|
stage3[oldIndex]=newIndex;
|
|
|
|
|
|
|
|
/* set the following same properties only in stage3 */
|
|
|
|
while(++i<propsTop && x==props[map[i]]) {
|
|
|
|
stage3[map[i]]=newIndex;
|
|
|
|
}
|
|
|
|
|
|
|
|
++newIndex;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* we saved some space */
|
|
|
|
stage3Top=propsTop;
|
|
|
|
propsTop=newIndex;
|
|
|
|
if(beVerbose) {
|
|
|
|
printf("compactProps() reduced propsTop from %u to %u\n", stage3Top, propsTop);
|
|
|
|
}
|
2000-08-11 00:02:59 +00:00
|
|
|
#if DO_DEBUG_OUT
|
|
|
|
{
|
2000-04-24 22:31:22 +00:00
|
|
|
/* debug output */
|
1999-12-15 04:42:56 +00:00
|
|
|
uint16_t i1, i2, i3, i4;
|
|
|
|
uint32_t c;
|
|
|
|
for(c=0; c<0xffff; c+=307) {
|
|
|
|
printf("properties(0x%06x)=0x%06x\n", c, getProps2(c, &i1, &i2, &i3, &i4));
|
|
|
|
}
|
|
|
|
}
|
2000-08-11 00:02:59 +00:00
|
|
|
#endif
|
1999-12-13 22:25:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
compareProps(const void *l, const void *r) {
|
|
|
|
uint32_t left=props[*(const uint16_t *)l], right=props[*(const uint16_t *)r];
|
|
|
|
|
|
|
|
/* compare general categories first */
|
|
|
|
int rc=(int)(left&0x1f)-(int)(right&0x1f);
|
|
|
|
if(rc==0 && left!=right) {
|
|
|
|
rc= left<right ? -1 : 1;
|
|
|
|
}
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* generate output data ----------------------------------------------------- */
|
|
|
|
|
|
|
|
extern void
|
2000-02-29 18:42:28 +00:00
|
|
|
generateData(const char *dataDir) {
|
1999-12-16 01:50:39 +00:00
|
|
|
static uint16_t indexes[8]={
|
1999-12-17 21:28:21 +00:00
|
|
|
STAGE_2_BITS, STAGE_3_BITS,
|
1999-12-16 01:50:39 +00:00
|
|
|
0, 0,
|
1999-12-17 21:28:21 +00:00
|
|
|
0, 0, 0, 0
|
1999-12-16 01:50:39 +00:00
|
|
|
};
|
|
|
|
|
1999-12-13 22:25:50 +00:00
|
|
|
UNewDataMemory *pData;
|
|
|
|
UErrorCode errorCode=U_ZERO_ERROR;
|
|
|
|
uint32_t size;
|
|
|
|
long dataLength;
|
1999-12-15 19:04:11 +00:00
|
|
|
uint16_t i, offset;
|
|
|
|
|
|
|
|
/* fix up the indexes in the stage tables to include the table offsets in the data */
|
2000-12-04 21:02:16 +00:00
|
|
|
offset=8+STAGE_1_BLOCK; /* uint16_t offset to stage2[] */
|
1999-12-15 19:04:11 +00:00
|
|
|
for(i=0; i<STAGE_1_BLOCK; ++i) {
|
|
|
|
stage1[i]+=offset;
|
|
|
|
}
|
|
|
|
|
2000-12-04 21:02:16 +00:00
|
|
|
offset+=stage2Top; /* uint16_t offset to stage3[] */
|
2000-12-08 01:08:43 +00:00
|
|
|
indexes[3]=offset;
|
1999-12-15 19:04:11 +00:00
|
|
|
for(i=0; i<stage2Top; ++i) {
|
|
|
|
stage2[i]+=offset;
|
|
|
|
}
|
1999-12-13 22:25:50 +00:00
|
|
|
|
2000-08-11 00:02:59 +00:00
|
|
|
offset=(uint16_t)((offset+stage3Top+1)/2); /* uint32_t offset to props[], include padding */
|
2000-12-04 21:02:16 +00:00
|
|
|
indexes[4]=offset; /* uint32_t offset to props[] */
|
|
|
|
|
1999-12-15 19:04:11 +00:00
|
|
|
for(i=0; i<stage3Top; ++i) {
|
|
|
|
stage3[i]+=offset;
|
|
|
|
}
|
|
|
|
|
2000-12-04 21:02:16 +00:00
|
|
|
offset+=propsTop;
|
|
|
|
indexes[2]=offset; /* uint32_t offset to exceptions[] */
|
1999-12-15 19:04:11 +00:00
|
|
|
|
2000-12-08 01:08:43 +00:00
|
|
|
offset+=exceptionsTop; /* uint32_t offset to the first unit after exceptions[] */
|
|
|
|
indexes[5]=offset;
|
2001-01-31 18:05:19 +00:00
|
|
|
|
|
|
|
ucharsTop=(ucharsTop+1)&~1;
|
|
|
|
offset+=(uint16_t)(ucharsTop/2); /* uint32_t offset to the first unit after uchars[] */
|
|
|
|
indexes[6]=offset;
|
2000-12-08 01:08:43 +00:00
|
|
|
size=4*offset; /* total size of data */
|
1999-12-15 19:04:11 +00:00
|
|
|
|
|
|
|
if(beVerbose) {
|
2001-03-27 22:24:40 +00:00
|
|
|
printf("number of stage 2 entries: %5u\n", stage2Top);
|
|
|
|
printf("number of stage 3 entries: %5u\n", stage3Top);
|
|
|
|
printf("number of unique properties values: %5u\n", propsTop);
|
|
|
|
printf("number of code points with exceptions: %5u\n", exceptionsCount);
|
|
|
|
printf("size in bytes of exceptions: %5u\n", 4*exceptionsTop);
|
|
|
|
printf("data size: %6lu\n", (unsigned long)size);
|
1999-12-15 19:04:11 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* write the data */
|
2000-03-04 01:19:19 +00:00
|
|
|
pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
|
1999-12-13 22:25:50 +00:00
|
|
|
haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) {
|
|
|
|
fprintf(stderr, "genprops: unable to create data memory, error %d\n", errorCode);
|
|
|
|
exit(errorCode);
|
|
|
|
}
|
|
|
|
|
1999-12-15 19:04:11 +00:00
|
|
|
udata_writeBlock(pData, indexes, sizeof(indexes));
|
|
|
|
udata_writeBlock(pData, stage1, sizeof(stage1));
|
|
|
|
udata_writeBlock(pData, stage2, 2*stage2Top);
|
|
|
|
udata_writeBlock(pData, stage3, 2*stage3Top);
|
2000-10-10 22:08:57 +00:00
|
|
|
udata_writePadding(pData, 2*((stage2Top+stage3Top)&1));
|
1999-12-15 19:04:11 +00:00
|
|
|
udata_writeBlock(pData, props32, 4*propsTop);
|
2000-04-18 16:56:02 +00:00
|
|
|
udata_writeBlock(pData, exceptions, 4*exceptionsTop);
|
2001-01-31 18:05:19 +00:00
|
|
|
udata_writeBlock(pData, uchars, 2*ucharsTop);
|
1999-12-13 22:25:50 +00:00
|
|
|
|
|
|
|
/* finish up */
|
|
|
|
dataLength=udata_finish(pData, &errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) {
|
|
|
|
fprintf(stderr, "genprops: error %d writing the output file\n", errorCode);
|
|
|
|
exit(errorCode);
|
|
|
|
}
|
|
|
|
|
|
|
|
if(dataLength!=(long)size) {
|
2001-03-27 22:24:40 +00:00
|
|
|
fprintf(stderr, "genprops: data length %ld != calculated size %lu\n",
|
|
|
|
dataLength, (unsigned long)size);
|
1999-12-13 22:25:50 +00:00
|
|
|
exit(U_INTERNAL_PROGRAM_ERROR);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* helpers ------------------------------------------------------------------ */
|
|
|
|
|
1999-12-15 04:42:56 +00:00
|
|
|
/* get properties after compacting them */
|
2000-08-11 00:02:59 +00:00
|
|
|
#if DO_DEBUG_OUT
static uint32_t
getProps2(uint32_t c, uint16_t *pI1, uint16_t *pI2, uint16_t *pI3, uint16_t *pI4) {
    /*
     * Look up the properties word for c through all four tables:
     * stage1[] -> stage2[] -> stage3[] -> props32[].
     * The index used for each of the four tables is also returned
     * via pI1..pI4. Only used for debug output.
     */
    const uint16_t idx1=(uint16_t)(c>>STAGE_1_SHIFT);
    const uint16_t idx2=(uint16_t)(stage1[idx1]+((c>>STAGE_2_SHIFT)&(STAGE_2_BLOCK-1)));
    const uint16_t idx3=(uint16_t)(stage2[idx2]+(c&(STAGE_3_BLOCK-1)));
    const uint16_t idx4=stage3[idx3];

    *pI1=idx1;
    *pI2=idx2;
    *pI3=idx3;
    *pI4=idx4;
    return props32[idx4];
}

/* get properties before compacting them */
static uint32_t
getProps(uint32_t c, uint16_t *pI1, uint16_t *pI2, uint16_t *pI3) {
    /*
     * Look up the properties word for c through three tables:
     * stage1[] -> stage2[] -> props[].
     * The index used for each of the three tables is also returned
     * via pI1..pI3. Only used for debug output.
     */
    const uint16_t idx1=(uint16_t)(c>>STAGE_1_SHIFT);
    const uint16_t idx2=(uint16_t)(stage1[idx1]+((c>>STAGE_2_SHIFT)&(STAGE_2_BLOCK-1)));
    const uint16_t idx3=(uint16_t)(stage2[idx2]+(c&(STAGE_3_BLOCK-1)));

    *pI1=idx1;
    *pI2=idx2;
    *pI3=idx3;
    return props[idx3];
}
#endif
|
1999-12-15 04:42:56 +00:00
|
|
|
|
|
|
|
/* set properties before compacting them */
|
|
|
|
static void
|
|
|
|
setProps(uint32_t c, uint32_t x, uint16_t *pI1, uint16_t *pI2, uint16_t *pI3) {
|
|
|
|
uint16_t i1, i2, i3;
|
|
|
|
|
|
|
|
*pI1=i1=(uint16_t)(c>>STAGE_1_SHIFT);
|
|
|
|
|
|
|
|
i2=stage1[i1];
|
|
|
|
if(i2==0) {
|
|
|
|
stage1[i1]=i2=allocStage2();
|
|
|
|
}
|
|
|
|
*pI2=i2+=(uint16_t)((c>>STAGE_2_SHIFT)&(STAGE_2_BLOCK-1));
|
|
|
|
|
|
|
|
i3=stage2[i2];
|
|
|
|
if(i3==0) {
|
|
|
|
stage2[i2]=i3=allocProps();
|
|
|
|
}
|
|
|
|
*pI3=i3+=(uint16_t)(c&(STAGE_3_BLOCK-1));
|
|
|
|
|
|
|
|
props[i3]=x;
|
|
|
|
}
|
|
|
|
|
|
|
|
static uint16_t
|
1999-12-22 22:57:04 +00:00
|
|
|
allocStage2(void) {
|
1999-12-15 04:42:56 +00:00
|
|
|
uint16_t i=stage2Top;
|
|
|
|
stage2Top+=STAGE_2_BLOCK;
|
|
|
|
if(stage2Top>=MAX_STAGE_2_COUNT) {
|
|
|
|
fprintf(stderr, "genprops: stage 2 overflow\n");
|
|
|
|
exit(U_MEMORY_ALLOCATION_ERROR);
|
|
|
|
}
|
|
|
|
return i;
|
|
|
|
}
|
|
|
|
|
|
|
|
static uint16_t
|
1999-12-22 22:57:04 +00:00
|
|
|
allocProps(void) {
|
1999-12-15 04:42:56 +00:00
|
|
|
uint16_t i=propsTop;
|
|
|
|
propsTop+=STAGE_3_BLOCK;
|
|
|
|
if(propsTop>=MAX_PROPS_COUNT) {
|
|
|
|
fprintf(stderr, "genprops: properties overflow\n");
|
|
|
|
exit(U_MEMORY_ALLOCATION_ERROR);
|
|
|
|
}
|
|
|
|
return i;
|
|
|
|
}
|
|
|
|
|
2001-01-31 18:05:19 +00:00
|
|
|
static uint32_t
|
|
|
|
addUChars(const UChar *s, uint32_t length) {
|
|
|
|
uint32_t top=(uint16_t)(ucharsTop+length);
|
1999-12-13 22:25:50 +00:00
|
|
|
UChar *p;
|
|
|
|
|
|
|
|
if(top>=MAX_UCHAR_COUNT) {
|
|
|
|
fprintf(stderr, "genprops: out of UChars memory\n");
|
|
|
|
exit(U_MEMORY_ALLOCATION_ERROR);
|
|
|
|
}
|
|
|
|
p=uchars+ucharsTop;
|
2001-01-31 18:05:19 +00:00
|
|
|
uprv_memcpy(p, s, 2*length);
|
1999-12-13 22:25:50 +00:00
|
|
|
ucharsTop=top;
|
2001-01-31 18:05:19 +00:00
|
|
|
return (uint32_t)(p-uchars);
|
1999-12-13 22:25:50 +00:00
|
|
|
}
|
2000-02-29 18:42:28 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Hey, Emacs, please set the following:
|
|
|
|
*
|
|
|
|
* Local Variables:
|
|
|
|
* indent-tabs-mode: nil
|
|
|
|
* End:
|
|
|
|
*
|
|
|
|
*/
|