2001-03-17 00:46:46 +00:00
|
|
|
/*
|
|
|
|
*******************************************************************************
|
|
|
|
*
|
2003-06-03 20:58:22 +00:00
|
|
|
* Copyright (C) 2001-2003, International Business Machines
|
2001-03-17 00:46:46 +00:00
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
*
|
|
|
|
*******************************************************************************
|
|
|
|
* file name: ucol_tok.h
|
|
|
|
* encoding: US-ASCII
|
|
|
|
* tab size: 8 (not used)
|
|
|
|
* indentation:4
|
|
|
|
*
|
|
|
|
* created 02/22/2001
|
|
|
|
* created by: Vladimir Weinstein
|
|
|
|
*
|
|
|
|
* This module reads a tailoring rule string and produces a list of
|
|
|
|
* tokens that will be turned into collation elements
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2001-01-29 22:09:24 +00:00
|
|
|
#ifndef UCOL_TOKENS_H
|
|
|
|
#define UCOL_TOKENS_H
|
|
|
|
|
2002-09-20 01:54:48 +00:00
|
|
|
#include "unicode/utypes.h"
|
2002-10-16 22:34:16 +00:00
|
|
|
#include "unicode/uset.h"
|
2002-09-20 01:54:48 +00:00
|
|
|
|
|
|
|
#if !UCONFIG_NO_COLLATION
|
|
|
|
|
2001-03-08 17:40:42 +00:00
|
|
|
#include "ucol_imp.h"
|
2001-06-06 20:21:48 +00:00
|
|
|
#include "uhash.h"
|
2001-08-16 23:38:09 +00:00
|
|
|
#include "unicode/parseerr.h"
|
2001-01-29 22:09:24 +00:00
|
|
|
|
2001-03-20 07:22:33 +00:00
|
|
|
/*
 * Sentinel 32-bit values used by the rule tokenizer.
 * NOTE(review): the names suggest "no value assigned" and "reset marker";
 * confirm the exact use against the tokenizer implementation.
 */
#define UCOL_TOK_UNSET 0xFFFFFFFF
#define UCOL_TOK_RESET 0xDEADBEEF
|
|
|
|
|
2001-01-31 23:12:37 +00:00
|
|
|
/*
 * Polarity values. Both are non-negative, so they fit an unsigned field.
 * NOTE(review): presumably stored in UColToken.polarity -- confirm.
 */
#define UCOL_TOK_POLARITY_NEGATIVE 0
#define UCOL_TOK_POLARITY_POSITIVE 1
|
2001-01-29 22:09:24 +00:00
|
|
|
|
2001-05-02 05:05:06 +00:00
|
|
|
/*
 * Tokenizer flag values.
 * TOP, VARIABLE_TOP and SUCCESS are distinct single bits; BEFORE (0x03)
 * covers the two lowest bits and does not overlap the other three.
 * NOTE(review): presumably combined in UColParsedToken.flags, with BEFORE
 * acting as a mask for a small value -- confirm against the parser.
 */
#define UCOL_TOK_TOP 0x04
#define UCOL_TOK_VARIABLE_TOP 0x08
#define UCOL_TOK_BEFORE 0x03
#define UCOL_TOK_SUCCESS 0x10
|
|
|
|
|
2001-03-14 00:12:46 +00:00
|
|
|
/*
 * Space reserved for the extra strings that need to be unquoted
 * during the parsing of the rules.
 * NOTE(review): the unit (UChars vs. bytes) is not visible here -- confirm
 * where the buffer is allocated.
 */
#define UCOL_TOK_EXTRA_RULE_SPACE_SIZE 2048
|
2001-01-31 23:12:37 +00:00
|
|
|
typedef struct UColToken UColToken;
|
2001-01-29 22:09:24 +00:00
|
|
|
|
2001-01-31 07:20:56 +00:00
|
|
|
typedef struct {
|
2001-06-06 20:21:48 +00:00
|
|
|
UColToken* first;
|
|
|
|
UColToken* last;
|
2001-01-29 22:09:24 +00:00
|
|
|
UColToken* reset;
|
2002-03-15 23:41:56 +00:00
|
|
|
UBool indirect;
|
2001-01-29 22:09:24 +00:00
|
|
|
uint32_t baseCE;
|
2001-02-08 01:04:43 +00:00
|
|
|
uint32_t baseContCE;
|
2001-01-29 22:09:24 +00:00
|
|
|
uint32_t nextCE;
|
2001-02-08 01:04:43 +00:00
|
|
|
uint32_t nextContCE;
|
2001-01-29 22:09:24 +00:00
|
|
|
uint32_t previousCE;
|
2001-02-08 01:04:43 +00:00
|
|
|
uint32_t previousContCE;
|
2001-02-21 17:45:06 +00:00
|
|
|
int32_t pos[UCOL_STRENGTH_LIMIT];
|
|
|
|
uint32_t gapsLo[3*UCOL_CE_STRENGTH_LIMIT];
|
|
|
|
uint32_t gapsHi[3*UCOL_CE_STRENGTH_LIMIT];
|
|
|
|
uint32_t numStr[UCOL_CE_STRENGTH_LIMIT];
|
|
|
|
UColToken* fStrToken[UCOL_CE_STRENGTH_LIMIT];
|
|
|
|
UColToken* lStrToken[UCOL_CE_STRENGTH_LIMIT];
|
2001-01-31 07:20:56 +00:00
|
|
|
} UColTokListHeader;
|
|
|
|
|
|
|
|
struct UColToken {
|
2001-01-31 21:10:55 +00:00
|
|
|
UChar debugSource;
|
|
|
|
UChar debugExpansion;
|
2001-09-27 23:18:14 +00:00
|
|
|
UChar debugPrefix;
|
2001-02-26 10:28:56 +00:00
|
|
|
uint32_t CEs[128];
|
|
|
|
uint32_t noOfCEs;
|
|
|
|
uint32_t expCEs[128];
|
|
|
|
uint32_t noOfExpCEs;
|
2001-01-31 07:20:56 +00:00
|
|
|
uint32_t source;
|
|
|
|
uint32_t expansion;
|
2001-09-27 23:18:14 +00:00
|
|
|
uint32_t prefix;
|
2001-01-31 23:12:37 +00:00
|
|
|
uint32_t strength;
|
2001-02-09 01:04:08 +00:00
|
|
|
uint32_t toInsert;
|
2001-01-31 23:12:37 +00:00
|
|
|
uint32_t polarity; /* 1 for <, <<, <<<, , ; and -1 for >, >>, >>> */
|
2001-01-31 07:20:56 +00:00
|
|
|
UColTokListHeader *listHeader;
|
2001-01-31 21:10:55 +00:00
|
|
|
UColToken* previous;
|
|
|
|
UColToken* next;
|
2001-10-17 22:54:58 +00:00
|
|
|
UChar *rulesToParse;
|
2001-01-29 22:09:24 +00:00
|
|
|
};
|
|
|
|
|
2002-03-11 23:52:03 +00:00
|
|
|
/*
 * This is a token that has been parsed
 * but not yet processed. Used to reduce
 * the number of arguments in the parser.
 *
 * The chars, extension and prefix parts are each described by an
 * (offset, length) pair.
 * NOTE(review): the buffer those offsets index into is not visible in
 * this header -- confirm against the parser.
 */
typedef struct {
    uint32_t strength;
    uint32_t charsOffset;
    uint32_t charsLen;
    uint32_t extensionOffset;
    uint32_t extensionLen;
    uint32_t prefixOffset;
    uint32_t prefixLen;
    uint16_t flags;
    uint16_t indirectIndex;
} UColParsedToken;
|
|
|
|
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
UColParsedToken parsedToken;
|
2001-03-14 00:12:46 +00:00
|
|
|
UChar *source;
|
|
|
|
UChar *end;
|
2002-10-16 22:34:16 +00:00
|
|
|
const UChar *current;
|
2001-03-14 00:12:46 +00:00
|
|
|
UChar *sourceCurrent;
|
|
|
|
UChar *extraCurrent;
|
|
|
|
UChar *extraEnd;
|
2003-04-30 00:49:01 +00:00
|
|
|
const InverseUCATableHeader *invUCA;
|
2001-01-31 07:20:56 +00:00
|
|
|
const UCollator *UCA;
|
2001-06-06 20:21:48 +00:00
|
|
|
UHashtable *tailored;
|
2001-03-30 00:23:46 +00:00
|
|
|
UColOptionSet *opts;
|
2001-01-31 07:20:56 +00:00
|
|
|
uint32_t resultLen;
|
2002-07-02 22:32:14 +00:00
|
|
|
uint32_t listCapacity;
|
2001-01-31 07:20:56 +00:00
|
|
|
UColTokListHeader *lh;
|
2001-04-19 17:08:07 +00:00
|
|
|
UColToken *varTop;
|
2002-10-16 22:34:16 +00:00
|
|
|
USet *copySet;
|
|
|
|
USet *removeSet;
|
2001-01-31 07:20:56 +00:00
|
|
|
} UColTokenParser;
|
|
|
|
|
2001-03-07 19:43:06 +00:00
|
|
|
typedef struct {
|
|
|
|
const UChar *subName;
|
|
|
|
int32_t subLen;
|
|
|
|
UColAttributeValue attrVal;
|
|
|
|
} ucolTokSuboption;
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
const UChar *optionName;
|
|
|
|
int32_t optionLen;
|
2001-10-11 23:54:55 +00:00
|
|
|
const ucolTokSuboption *subopts;
|
2001-03-07 19:43:06 +00:00
|
|
|
int32_t subSize;
|
|
|
|
UColAttribute attr;
|
|
|
|
} ucolTokOption;
|
2001-01-31 07:20:56 +00:00
|
|
|
|
|
|
|
/*
 * Evaluates to nonzero when ch lies in one of these ASCII ranges:
 *   0x0020..0x002F, 0x003A..0x003F, 0x005B..0x0060,
 *   0x007B, 0x007D..0x007E
 * Digits, letters, 0x0040 ('@') and 0x007C ('|') are not matched.
 *
 * The argument is evaluated several times, so it must not have side
 * effects (do not pass an expression such as *p++).
 */
#define ucol_tok_isSpecialChar(ch)              \
    (((((ch) <= 0x002F) && ((ch) >= 0x0020)) || \
    (((ch) <= 0x003F) && ((ch) >= 0x003A)) ||   \
    (((ch) <= 0x0060) && ((ch) >= 0x005B)) ||   \
    (((ch) <= 0x007E) && ((ch) >= 0x007D)) ||   \
    (ch) == 0x007B))
|
2001-01-29 22:09:24 +00:00
|
|
|
|
|
|
|
|
2001-10-22 05:30:22 +00:00
|
|
|
U_CFUNC
|
|
|
|
uint32_t ucol_tok_assembleTokenList(UColTokenParser *src,
|
|
|
|
UParseError *parseError,
|
|
|
|
UErrorCode *status);
|
2001-03-22 21:16:20 +00:00
|
|
|
|
2001-10-22 05:30:22 +00:00
|
|
|
U_CFUNC
|
2001-06-06 20:21:48 +00:00
|
|
|
void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint32_t rulesLength, UCollator *UCA, UErrorCode *status);
|
2001-10-22 05:30:22 +00:00
|
|
|
|
|
|
|
/*
 * Closes the token list state held in src.
 * NOTE(review): presumably releases what ucol_tok_initTokenList and
 * ucol_tok_assembleTokenList allocated -- confirm against the implementation.
 */
U_CFUNC void ucol_tok_closeTokenList(UColTokenParser *src);
|
|
|
|
|
|
|
|
/*
 * Parses the next token from the rules held in src.
 *
 * src          - tokenizer state
 * startOfRules - flag supplied by the caller
 * parseError   - receives parse error details
 * status       - ICU error code, in/out
 * Returns a pointer to const UChar.
 * NOTE(review): presumably startOfRules is true for the first call on a
 * rule string, the result is left in src->parsedToken, and NULL signals
 * end of rules or failure -- confirm against the implementation.
 */
U_CAPI const UChar* U_EXPORT2 ucol_tok_parseNextToken(UColTokenParser *src,
                                                      UBool startOfRules,
                                                      UParseError *parseError,
                                                      UErrorCode *status);
|
2001-06-27 22:37:31 +00:00
|
|
|
|
2002-04-30 23:29:44 +00:00
|
|
|
U_CAPI const UChar * U_EXPORT2
|
|
|
|
ucol_tok_getNextArgument(const UChar *start, const UChar *end,
|
|
|
|
UColAttribute *attrib, UColAttributeValue *value,
|
|
|
|
UErrorCode *status);
|
|
|
|
|
2002-09-20 01:54:48 +00:00
|
|
|
#endif /* #if !UCONFIG_NO_COLLATION */
|
2001-10-20 01:09:31 +00:00
|
|
|
|
2001-01-29 22:09:24 +00:00
|
|
|
#endif
|