scuffed-code/icu4c/source/test/intltest/rndmcoll.cpp

1633 lines
51 KiB
C++
Raw Normal View History

/*
******************************************************************************
* Copyright (C) 2005-2005, International Business Machines Corporation and *
* others. All Rights Reserved. *
******************************************************************************
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <malloc.h>
#include <time.h>
#include "rndmcoll.h"
#if !UCONFIG_NO_COLLATION
//Raymond: Following comments are copied from Java implementation
//
// each rule can be:
// "[" command "]"
// "& [" position "]"
// "&" before chars
// relation "[variable top]"
// relation (chars "|")? chars ("/" chars)?
// plus, a reset must come before a relation
//Raymond: The grammar of "collation rule" can be defined use a modified-BNF.
// We need a tool to
// 1. Parse the defination and
// 2. Build an active object which can generate concrete collation rules
//
//Rammond:
// The difference between standarad BNF and our modified-BNF is
// 1. Alternation item can has a "weight" now
// 2. Accept "? weight" as a new operation -- short form altheration
// 3. Accept "range" as a new operation -- repeat
// We do not accept any EBNF grammar in our modified-BNF.
//
// Furthermore, the grammar of our modified-BNF itself can be defined using standard BNF
// NOTE: Following characters are treated as literal in the definition
// { } ? $ % , - ;
//
// string =
// alphabeta =
// digit =
// integer = integer digit | digit
// var = var alphabet | var digit | $ alphabet
//
// var-defs = var-defs var-def | var-def
// var-def = var '=' defination;
//
// defination = simple | repeat | short-alt | sequence | alternation1 | alternation2
// defination = alternation1 | alternation2
//
// simple = var | string | '(' defination ')'
// repeat = simple range
// short-alt = simple ? | simple ? weight
//
// item = simple | repeat | shor-alt
// sequence = sequence item | item item
//
// item1 = sequence
// alternation1 = alternation1 '|' item1 | item1 '|' item1
//
// item2 = simple weight
// alternation2 = alternation2 '|' item2 | item2
//
// range = { integer , integer }
// weight = integer %
//
// Special-characters:
// (sapce) contact operation, or separators to increase readability
// = definition
// | selection operation
// ( ) precedence select
// ' ' override special-character to plain character
//
/////////////////////////////////////////
// Completeness vs. Magic:
// The modified-BNF definition of "collation rule" need not be complete.
// It means following assertion is do acceptable:
// o Some variables are undefined. or
// o We cannot get a "collation rule" according the modified-BNF definition.
// Let's explain:
//
// Our target is to build an active object which can generate concrete collation rules.
//
// In order to formalize the generating process, we used modified-BNF to describe it.
// Then, the parser will help us to build an complex active object from basic active objects.
//
// It's acceptable that some basice active object is defined outside the definition and magically injected into.
//
// The magic power is got via empty variable defination. After parser pasing the definition,
// we get a part-defined active object, then we inject some magic active objects to
// change the prat-defined active object to a complete active object.
//
// Following are copied from Java implementation with less modification.
static const char collationBNF[] =
"$s = ' '? 50%;"
"$crlf = '\r\n';"
"$alternateOptions = non'-'ignorable | shifted;"
"$onoff = on | off;"
"$caseFirstOptions = off | upper | lower;"
"$strengthOptions = '1' | '2' | '3' | '4' | 'I';"
"$commandList = '['"
" ( alternate ' ' $alternateOptions"
" | backwards' 2'"
" | normalization ' ' $onoff "
" | caseLevel ' ' $onoff "
" | hiraganaQ ' ' $onoff"
" | caseFirst ' ' $caseFirstOptions"
" | strength ' ' $strengthOptions"
" ) ']';"
"$command = $commandList $crlf;"
"$ignorableTypes = (tertiary | secondary | primary) ' ' ignorable;"
"$allTypes = variable | regular | implicit | trailing | $ignorableTypes;"
"$positionList = '[' (first | last) ' ' $allTypes ']';"
"$beforeList = '[before ' ('1' | '2' | '3') ']';"
"$relationList = ("
" '<'"
" | '<<'"
" | ';'"
" | '<<<'"
" | ','"
" | '='"
");"
"$string = $magic;"
"$rel1 = '[variable top]' $s;"
"$p1 = ($string $s '|' $s)? 25%;"
"$p2 = ('/' $s $string $s)? 25%;"
"$rel2 = $p1 $string $s $p2;"
"$relation = $relationList $s ($rel1 | $rel2) $crlf;"
"$reset = '&' $s ($beforeList $s)? 10% ($positionList 1% | $string 10%) $crlf;"
"$mostRules = $command 1% | $reset 5% | $relation 25%;"
"$root = $command{0,5} $reset $mostRules{1,20};";
// Document of class LiteralToEscape
//
// ATTENTION:
// From http://icu.sourceforge.net/userguide/Collate_Customization.html.
// We get the precedence of escape/quote operations
//
// (highest) 1. backslash \
// 2. two single quotes ''
// 3. quoting ' '
//
// ICU Collation should accept following as the same string.
//
// 1) 'ab'c _
// 2) a\bc \
// 3) a'b'\c |- They are equal.
// 4) abc _/
//
// From "two single quotes", we have following deductions
// D1. empty quoting is illgal. (obviously)
// D2. no contact operation between two quotings
// '.''.' is not .. it is .'.
// D3. "two single quotes" cannot contact two quoting simultaneously
// '..''''.' is not ..'. it is ..''.
// NOTICE:
// "two single quotes" can contact before one quoting
// '''.' is '.
// "two single quotes" can literally contact after one quoting
// But, from syntax, it's one quoting including a "two single quotes"
// '.''' is .'
// D4. "two single quotes" cannot solely be included in quoting
// '''' is not ' it is ''
// NOTICE: These are legal
// '.''.' is .'.
// '.''' is .'
//
// dicision
// /\
// /__\
// output buffer input buffer
//
// To make our dicision (within an atom operation) without caring input and output buffer,
// following calling pattern (within an atom operation) shall be avoided
//
// P1 open_quoting() then close_quoting() (direct violation) D1
// P2 close_quoting() then open_quoting() (direct violation) D2
// P3 empty open_quoting() (indirect violation) D1, D4
// P4 empty close_quoting() (indirect violation) D2, D3
// P5 open_quoting() then two single quotes (indirect violation) D4
// P6 close_quoting() then two single quotes (indirect violation) D3
//
// two single quotes escaping will not open_ or close_ quoting()
// The choice will not lose some quoing forms.
//
// For open_quoting(),
// we may get this form quoting ''' P5
// It may raise a bug ''''x
// If we expect
// '''.' let the next char open the quoting
// '.''.' the quoting is already opened by preceding char
//
// For close_quoting()
// we will get this form quoting '.''' P6
// It may raise a bug '.''''.'
// If we expect
// '.'''\. let the next char close the quoting
// '.''''.' the expectation is wrong! using '.'\''.' instead
//
// It's a hard work to readjust generation opportunity for various escaping form.
// We just simply ignore it.
static const char DIGIT_CHAR[] = "0123456789";
static const char WHITE_SPACE[] = {'\t', ' ', '\r', '\n', 0};
static const char ALPHABET[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
static inline UBool isInList(const char c /*in*/, const char list[] /*in*/){
const char * p = list;
for (;*p != 0 && *p != c; p++);
return *p?TRUE:FALSE;
}
static inline UBool isDigit(char c) {return isInList(c, DIGIT_CHAR);}
static inline UBool isWhiteSpace(char c) {return isInList(c, WHITE_SPACE);}
static inline UBool isAlphabet(char c) {return isInList(c, ALPHABET);}
static inline UBool isSpecialAsciiChar(char c) {
return (c >= 0x0021 && c <= 0x007E &&
!((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
(c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
(c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/)));
}
// Utility class, can be treated as an auto expanded array. no boundary check.
class Buffer_byte{
typedef char byte;
byte * start;
byte * current;
int buffer_size; // size unit is byte
inline void expand(int add_size = 100){ // size unit is byte
int new_size = buffer_size + add_size;
int cs_snap = content_size();
start = (byte *) realloc(start, new_size); // may change the value of start
current = start + cs_snap;
memset(current, 0, add_size);
buffer_size = new_size;
}
inline void expand_to(int size){
int r = size - buffer_size;
if (r > 0) {
expand(r); // simply expand, no block alignment
}
}
public:
Buffer_byte():start(NULL),current(start),buffer_size(0){
expand();
}
~Buffer_byte(){
free(start);
}
int content_size(){return current - start;} // size unit is byte
inline void reset(){
start != NULL ? memset(start, 0, buffer_size) : 0;
current = start;
}
// Using memory copy method to append a C array to buffer,
inline void append(const void * c, int size){ // size unit is byte
expand_to(content_size() + size) ;
memcpy(current, c, size);
current = current + size;
}
void * operator &(){
return start;
}
};
//template<typename type>
// class BUFFER{
#define BUFFER(type, name)\
class name {\
private:\
Buffer_byte buf;\
public:\
void reset() {buf.reset();}\
void append(type c) {buf.append(&c, sizeof(type));}\
void append_array(const type * p, int size) {buf.append(p, sizeof(type)*size);}\
type * operator &(){return (type *) &buf;}\
type & operator [] (int i) { return operator&()[i];}\
operator type *(){return operator&();}\
int content_size(){return buf.content_size() / sizeof(type);}\
};
class Node;
//typedef BUFFER<char> Buffer_char;
//typedef BUFFER<int> Buffer_int;
//typedef BUFFER<Node *> Buffer_pNode;
BUFFER(char, Buffer_char);
BUFFER(int, Buffer_int);
BUFFER(Node *, Buffer_pNode);
/* Helper class
* Encoding a string literal to a valid collation escaping string.
* See documents in anonymous.design
*/
class LiteralToEscape{
public:
// Return a null-terminate c-string. The buffer is owned by callee.
char * operator()(const char * literal /*c-string*/){
str.reset();
for(;*literal != 0; literal++){
append(*literal);
}
close_quoting(); // P4 exception, to close whole quoting
return str;
}
enum CHOICE {YES, NO, RAND};
enum ESCAPE_FORM {BSLASH_ONLY, QUOTE_ONLY, QUOTE_AND_BSLAH, RAND_ESC};
LiteralToEscape(CHOICE escape_literal = RAND,
CHOICE two_quotes_escape = RAND,
ESCAPE_FORM escape_form = RAND_ESC):
escape_literal(escape_literal),
two_quotes_escape(two_quotes_escape),
escape_form(escape_form),
is_quoting(FALSE){}
private:
Buffer_char str;
class Bool{ // assigned or random value
public:
operator UBool() { // conversion operator
if (tag == RAND){
return rand()%2 == 1;
} else {
return tag == YES ? TRUE : FALSE;
}
}
Bool(CHOICE flag=RAND):tag(flag){}
private:
CHOICE tag;
};
ESCAPE_FORM escape_form;
UBool quote_escape;
UBool bslash_escape;
Bool escape_literal;
Bool two_quotes_escape;
void set_options(){
ESCAPE_FORM t = escape_form == RAND_ESC ? (ESCAPE_FORM) (rand()%3) : escape_form;
switch (t){
case BSLASH_ONLY :
bslash_escape = TRUE; quote_escape = FALSE; break;
case QUOTE_ONLY:
bslash_escape = FALSE;quote_escape = TRUE; break;
case QUOTE_AND_BSLAH:
bslash_escape = TRUE; quote_escape = TRUE; break;
default:
;// error
}
}
// str [in] null-terminated c-string
void append(const char * str){
for(;*str != 0; str++){
append(*str);
}
}
inline void append(const char c){
set_options();
if (c == '\\'){
quote_escape ? open_quoting() : close_quoting();
//bslash_escape always true here
str.append('\\');
str.append('\\');
} else if (c == '\''){
if (two_quotes_escape){ // quoted using two single quotes
// See documents in anonymous.design
str.append('\'');
str.append('\'');
} else{
quote_escape ? open_quoting() : close_quoting();
//bslash_escape always true here
str.append('\\');
str.append('\'');
}
} else if (isSpecialAsciiChar(c) || isWhiteSpace(c)){
quote_escape ? open_quoting() : close_quoting();
if (bslash_escape) str.append('\\');
str.append(c);
} else { //if (isAlphabet(c) || isDigit(c) || TRUE){ // treat others as literal
if (escape_literal){
quote_escape ? open_quoting() : close_quoting();
if (bslash_escape) str.append('\\');
str.append(c);
} else {
close_quoting();
str.append(c);
}
}
}
void reset(){
str.reset();
is_quoting = FALSE;
}
UBool is_quoting;
inline void open_quoting(){
if(is_quoting){
// do nothing
} else {
str.append('\'');
is_quoting = TRUE;
}
}
inline void close_quoting(){
if(is_quoting){
str.append('\'');
is_quoting = FALSE;
} else {
// do nothing
}
}
};
enum TokenType {STRING, VAR, NUMBER, WEIGHT, STREAM_END, ERROR, QUESTION_MARK,RANG_START,RANG_END, LPAR, RPAR, SEMI, EQ, COMMA, BAR};
/* A simple complier scanner to get token from source string.
*
* The result is put in this->tokenBuffer
* The buffer is owned by Scanner, and will be destoried in next call for getNextToken()
*/
class Scanner{
public:
// source [in] null-terminated c-string
Scanner(const char *const source/*c-string*/):source(source), working(source), history(source){
}
char tokenBuffer[50]; //null terminated c-string. LIMITATION & ASSUMPTION here
TokenType tokenType;
/* this->working [in]
* this->tokenBuffer [out]
* this->tokenType [out]
*/
TokenType getNextToken(){
history = working;
p_b = tokenBuffer; // for simplicity, no buffer overflow will be checked
tokenType = ERROR;
StateType state = START;
while (state != DONE){
char c = *working++;
switch(state){
case START:
if (isWhiteSpace(c)){
// do nothing, skip
} else if (isDigit(c)){
*p_b++ = c; // no overflow check
state = IN_NUM;
} else if (isAlphabet(c)){
*p_b++ = c; // no overflow check
state = IN_STRING;
} else if (c == '$'){
*p_b++ = c; // no overflow check
state = IN_VAR;
} else if (c == '\''){
state = IN_QUOTE;
} else if (c == '\\'){
state = IN_BSLASH;
} else if (c == 0){
tokenType = STREAM_END;
state = DONE;
working--;
} else{
switch(c){
case '?': tokenType = QUESTION_MARK; break;
case '{': tokenType = RANG_START; break;
case '}': tokenType = RANG_END; break;
case '(': tokenType = LPAR; break;
case ')': tokenType = RPAR; break;
case ';': tokenType = SEMI; break;
case '=': tokenType = EQ; break;
case ',': tokenType = COMMA; break;
case '|': tokenType = BAR; break;
default: tokenType = ERROR;
}
//Raymond: Can we gracefully remove the unnecessary test?
// == Can we write a more beautiful 'switch' statement?
if (tokenType == ERROR){
working--;
*p_b = 0;
} else {
*p_b++ = c; // tokenBuffer[0], no overflow check
*p_b++ = 0; // tokenBuffer[1], no overflow check
}
state = DONE;
}
break;//START
case IN_NUM:
if (isDigit(c)){
*p_b++ = c; // no overflow check
} else if (c == '%'){ // no blank space between NUMBER and % symbol
*p_b++ = c;
*p_b = 0;
tokenType = WEIGHT;
state = DONE;
} else {
working--; // reset working point to current character
tokenType = NUMBER;
*p_b = 0;
state = DONE;
}
break;//IN_NUM
case IN_VAR:
if (isAlphabet(c) || isDigit(c)){ // For simplicity, digit can be the leading char
*p_b++ = c; // no overflow check
} else {
working--;
*p_b = 0;
tokenType = VAR;
state = DONE;
}
break;//IN_VAR
case IN_STRING:
if (c == '\''){
state = IN_QUOTE;
} else if (c =='\\'){ // NOTE: escaping for C language syntax here
state = IN_BSLASH;
} else if (isAlphabet(c) || isDigit(c)){
*p_b++ = c; // no overflow check
} else{
working--;
*p_b = 0;
tokenType = STRING;
state = DONE;
}
break;//IN_STRING
case IN_QUOTE:
if (c == '\''){
state = IN_STRING; // Yes, IN_STRING
} else {
*p_b++ = c; // no tokenBuffer overflow check !!!
}
break;//IN_QUOTE
case IN_BSLASH:
if (c == 'n') {
*p_b++ = '\n'; // no tokenBuffer overflow check
} else if (c == 'r'){
*p_b++ = '\r'; // no tokenBuffer overflow check
} else if (c == 't'){
*p_b++ = '\t'; // no tokenBuffer overflow check
} else if (c == '\''){ // NOTE: escaping for C language syntax here
*p_b++ = '\''; // no tokenBuffer overflow check
} else {
working--;
}
state = IN_STRING; // Yes, IN_STRING
break;//IN_BSLASH
case DONE: /* should never happen */
default:
working--;
*p_b = 0;
tokenType = ERROR;
state = DONE;
break;
}//switch(state)
}//while (state != DONE)
return tokenType;
}
inline UBool ungetToken(){
working = history;
}
inline void dumpCurrentPoint(){
printf("\n______________________________________________________________________________\n");
fwrite(source, history - source, 1, stdout);
printf("\n=====current token=====\n");
fwrite(history, working - history, 1,stdout);
printf("\n>>>>>current point>>>>>\n");
//printf(working); // This function will consume some characters, for example %
int len = strlen(working);
fwrite(working, len, 1, stdout);
printf("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");
}
private:
const char *const source;
const char * history;
const char * working;
char * p_b;
enum StateType {START, IN_NUM, IN_VAR, IN_QUOTE, IN_BSLASH, IN_STRING, DONE};
};//class Scanner
class Node{
public:
// Return a null-terminated c-string. The buffer is owned by callee.
virtual const char* getTargetString() = 0;
virtual ~Node(){};
};
/* Helper class.
* It's a mapping table between 'variable name' and its 'active Node object'
*/
class NodeSymbolTable{
public:
UBool is_var_exist(const char *const var_name /*c-string*/){
return get_var_name_index(var_name) == -1? FALSE : TRUE;
}
UBool does_var_has_ref(const char *const var_name /*c-string*/){
int i = get_var_name_index(var_name);
if (i == -1){
return FALSE;
} else {
return refs[i] == NULL ? FALSE : TRUE;
}
}
Node * get_var_ref(const char *const var_name /*c-string*/){
int i = get_var_name_index(var_name);
if (i == -1){
printf("name NOT exist: %s\n", var_name);
return NULL;
} else {
if (refs[i]){
//printf("name and ref exist: %s\n", var_name);
} else {
printf("name exist, ref NOT exist: %s\n", var_name);
}
return refs[i];
}
}
void put_var(const char *const var_name, Node *const var_ref = NULL){
int i = get_var_name_index(var_name);
if (i == -1 && var_name !=NULL){ // new variable
int offset = name_buffer.content_size();
name_buffer.append_array(var_name, strlen(var_name) + 1);
names.append(offset);
refs.append(var_ref);
} else {
if(refs[i] == NULL && var_ref != NULL){ // exist variable, no ref
refs[i] = var_ref; // link definition with variable
};
}
}
void reset(){
names.reset();
name_buffer.reset();
// release memory here
int s = refs.content_size();
for (int i=0; i < s; i++){
delete refs[i];
}
refs.reset();
}
private:
Buffer_int names; // indexes in name_buffer
Buffer_pNode refs;
Buffer_char name_buffer; // var names storage space
int get_var_name_index(const char *const var_name){
int len = names.content_size();
for (int i=0; i< len; i++){
if (strcmp(var_name, &name_buffer + names[i]) == 0){
return i;
}
}
return -1;
}
};
class LiteralNode : public Node {
public:
virtual const char* getTargetString(){
return str;
}
LiteralNode(const char * s /*c-string*/){
str.append_array(s, strlen(s) + 1);
}
private:
Buffer_char str; //null-terminated c-string
};
class VariableNode : public Node {
public:
virtual const char* getTargetString(){
link();
if (var_ref == NULL) {
return ""; // constant string has global life-cycle
}
return var_ref->getTargetString();
}
VariableNode(const char * var_name, NodeSymbolTable * symbols):symbols(*symbols){
this->var_name.append_array(var_name, strlen(var_name) + 1);
this->var_ref = NULL;
}
UBool link(){
if (var_ref == NULL) {
var_ref = &symbols == NULL ? NULL : symbols.get_var_ref(var_name);
return var_ref != NULL;
}
return TRUE;
}
private:
Buffer_char var_name;
Node * var_ref;
NodeSymbolTable & symbols;
};
class Magic_SelectOneChar : public Node{
public:
virtual const char* getTargetString(){
return &set + rand() % len;
}
Magic_SelectOneChar( const char * set /*char set*/): len(strlen(set)){
this->set.append_array(set, len);
}
private:
Buffer_char set;
const int len;
};
class MagicNode : public Node {
public:
virtual const char* getTargetString(){
return "aaa";
return l(select_an_string());
}
private:
LiteralToEscape l;
Buffer_char str;
// compose a string with lenght {1, 5}
const char * select_an_string(){
int r = rand();
r %= 5;
r += 1; // shift 0..4 to 1..5
str.reset();
for (int i=0; i < r; i++){
str.append(select_an_char());
}
str.append(0);
return &str;
}
// randomly select a char from a set
char select_an_char(){
static const char *const set = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ[]&<";
static const int len = strlen(set);
int i = rand()%len;
return set[i];
}
};
class SequenceNode : public Node {
public:
virtual const char* getTargetString(){
str.reset();
int l = items.content_size();
for(int i=0; i < l; i++){
const char * temp = items[i]->getTargetString();
str.append_array(temp, strlen(temp));
}
str.append(0); // terminal null
return str;
}
void append (Node * node){
items.append(node);
}
virtual ~SequenceNode(){
int l = items.content_size();
for(int i=0; i < l; i++){
//How can assure the item is got from heap?
//Let's assume it.
delete items[i];
}
}
private:
Buffer_pNode items;
Buffer_char str; //null-terminated c-string
};
class RepeatNode : public Node {
public:
virtual const char* getTargetString(){
str.reset();
for(int i=0; i< select_a_count(); i++){
const char * temp = item->getTargetString();
str.append_array(temp, strlen(temp));
}
str.append(0);
return str;
}
RepeatNode(Node * item, int min_count =0, int max_count = 1){
this->item = item;
this->min_count = min_count;
this->max_count = max_count;
}
virtual ~RepeatNode(){
delete item; // We assume its space is got from heap
}
private:
Node * item;
Buffer_char str;
int min_count;
int max_count;
int select_a_count(){
int t = max_count - min_count + 1;
return min_count + rand()%(t);
}
};
class AlternationNode : public Node {
public:
virtual const char* getTargetString(){
str.reset();
int i = select_an_item();
const char * temp = items[i]->getTargetString();
str.append_array(temp, strlen(temp));
str.append(0);
return str;
}
virtual ~AlternationNode(){
int l = items.content_size();
for(int i=0; i < l; i++){
delete items[i]; // We assume its space is got from heap
}
}
protected:
Buffer_pNode items;
private:
Buffer_char str; // null-terminated c-string
// Select an item randomly and add it to target string
virtual int select_an_item() = 0;
};
class Alternation1Node : public AlternationNode{
public:
void append (Node * node){
items.append(node);
}
private:
int select_an_item(){
int entries = items.content_size();
int i = rand()%entries;
return i;
}
};
class Alternation2Node : public AlternationNode{
public:
void append (Node * node, int weight){
items.append(node);
weights.append(weight);
total += weight;
}
Alternation2Node():total(0){}
private:
Buffer_int weights;
double total;
// Select an item randomly. Hight weight item has more chance to be selected.
//
// +____+_+___+______+ <- total weight
// ^mark \__ one item
//
// We use following method to select an item.
// 1.locate a point in total weight randomly --> mark
// every weight has equal chance to be select
// 2.mark can identify an item --> item
// hight weight has more chance to be selected.
//
int select_an_item(){
double reference_mark = (double)rand()/ (double)RAND_MAX;
double mark = total * reference_mark;
int i=0;
for (;;){
mark -= weights[i];
if (mark <= 0)
break;
i++;
}
return i;
}
};
class Parser{
public:
Parser(const char * source, NodeSymbolTable * symbols):s(source), symbols(*symbols){
}
UBool parse(){
return rules();
}
private:
Scanner s;
TokenType token;
NodeSymbolTable & symbols;
UBool match(TokenType expected){
if (token == expected) {
token = s.getNextToken();
return TRUE;
} else {
//s.dumpCurrentPoint();
return FALSE;
}
}
UBool rules(){
symbols.reset();
token = s.getNextToken();
while (rule()){
}
if (token == STREAM_END){
return TRUE;
} else {
s.dumpCurrentPoint();
return FALSE;
}
}
UBool rule(){
if (token == VAR){
Buffer_char name;
name.append_array(s.tokenBuffer, strlen(s.tokenBuffer));
name.append(0);
match(VAR);
if (match(EQ)){
Node * t = NULL;
if(defination(t)){
symbols.put_var(name, t);
return match(SEMI);
}
}
}
return FALSE;
}
UBool defination(Node* &node /*in,out*/){
if (node != NULL) return FALSE;
//assert node == NULL
if (simple(node)){
if (token == WEIGHT){
return alternation2(node);
} else {
return alternation1(node);
}
}
return FALSE;
}
UBool alternation2(Node * &node /*in,out*/){
if (node == NULL) return FALSE;
//assert node != NULL, and is simple node
int w;
if (!weight(w)){
delete node;
node = NULL;
return FALSE;
}
// Raymond: (For interest and study purpose)
// We accept alternation2 with only one item, although I do think it is meanfull.
//
// Single item alternation2 should equal to "a simple without weight" rather than a short-alt
//
// Another reasone is, we think 'weight' should be owned by alternation2 rather than item2 itself.
Alternation2Node * t = new Alternation2Node();
t->append(node, w);
node = NULL; // Logically, it has nothing
Node * temp = NULL; // We can use 'node' as temp variable, but its name is uncomfortable
while (token == BAR){
match(BAR);
if (simple(temp)){
if (weight(w)){
t->append(temp, w);
} else {
delete temp;
goto FAIL;
}
temp = NULL; // Logically, it has nothing now
} else {
goto FAIL;
}
}
if (token == SEMI || token == RPAR){
node = t; // A whole new node
return TRUE;
}
// for example, this is illegal: a 4% | b 5% c
FAIL:
delete t; // fall down...
return FALSE;
}
UBool weight(int & w){
if (token == WEIGHT){
w = atoi(s.tokenBuffer);
match(WEIGHT);
return TRUE;
}
return FALSE;
}
UBool alternation1(Node * &node){
if (!sequence(node)){
return FALSE;
}
if (token == BAR){ // detected a real alternation1, create it.
return alternation1_open(node);
} else { // just something with higher precedence, not a alternation1
return TRUE;
}
}
UBool alternation1_open(Node * &node){
if (node == NULL) return FALSE;
// assert node != NULL, and node is sequence or simpler thing
Alternation1Node * t = new Alternation1Node();
t->append(node);
node = NULL; // Logically, it has nothing
Node * temp = NULL; // We can use 'node' as temp variable, but its name is uncomfortable
// We can use either recursion (linking node) or loop (plain array) to create the list
// Here, we chosse loop (plain array).
while (token == BAR){
match(BAR);
if(sequence(temp)){
t->append(temp);
temp = NULL;
} else {
goto FAIL;
}
}
if (token == SEMI || token == RPAR){
node = t;
return TRUE;
}
FAIL:
delete t;
return FALSE;
}
UBool sequence(Node* &node){
if (!item(node)) {
return FALSE;
}
if (token == VAR || token == STRING || token == LPAR){ // maybe an item
return sequence_open(node);
} else { // just something with higher precedence.
return TRUE;
}
}
UBool sequence_open(Node* &node){
if (node == NULL) return FALSE;
// assert node != NULL, and node is item (simple, repeat, or short-alt)
SequenceNode* t = new SequenceNode();
t->append(node);
node = NULL; // Logically, it has nothing
Node * temp = NULL; // We can use 'node' as temp variable, but its name is uncomfortable
while (token == VAR || token == STRING || token == LPAR){ // maybe a simple
if (item(temp)){
t->append(temp);
temp = NULL;
} else {
goto FAIL;
}
}
// ILLEGAL: a c 5%
if (token == SEMI || token == RPAR || token == BAR){
node = t;
return TRUE;
}
FAIL:
delete t;
return FALSE;
}
UBool item(Node *& node /*out*/){
if (node != NULL){
// assert node is simple
// go on
} else {
if (simple(node)){
// go on
} else {
return FALSE;
}
}
// assert node != NULL, node is simple
switch (token){
case RANG_START:
return repeat(node);
case QUESTION_MARK:
return short_alt(node);
default:
return TRUE; // bare simple
}
}
// get a 'simple node'
UBool simple(Node* &node /*out*/){
if (node != NULL) return FALSE;
//assert node == NULL
switch(token){
case LPAR:
match(LPAR);
if(defination(node) && match(RPAR)){
return TRUE;
}
return FALSE;
case VAR:
node = new VariableNode(s.tokenBuffer, &symbols);
match(VAR);
return TRUE;
case STRING:
node = new LiteralNode(s.tokenBuffer);
match(STRING);
return TRUE;
default:
return FALSE;
}
}
//upgrade a 'simple node' to 'repeat node'
UBool repeat (Node* &node /*in,out*/){
if (node == NULL) return FALSE;
//assert node != NULL, node is simple
if (match(RANG_START) && token == NUMBER){
int min = atoi(s.tokenBuffer);
match(NUMBER);
if(match(COMMA) && token == NUMBER){
int max = atoi(s.tokenBuffer);
match(NUMBER);
if(match(RANG_END)){
Node * t = node;
node = new RepeatNode(t, min, max);
return TRUE;
}
}
}
delete node;
node = NULL;
return FALSE;
}
//upgrade a 'simple node' to 'short-alt node'
UBool short_alt (Node* &node /*in,out*/){
if (node == NULL) return FALSE;
//assert node != NULL, node is simple
if (match(QUESTION_MARK)){
int exist_weight = 50;
if (token == WEIGHT){
exist_weight = atoi(s.tokenBuffer);
match(WEIGHT);
}
int null_weight = 100 - exist_weight;
Node * t1 = node;
Node * t2 = new LiteralNode("");
Alternation2Node * t = new Alternation2Node();
t->append(t1, exist_weight);
t->append(t2, null_weight);
node = t;
return TRUE;
}
delete node;
node = NULL;
return FALSE;
}
}; // class Parser
class RandomLanguageGenerator{
public:
//NOTE: start cannot be a magic node
RandomLanguageGenerator(const char *const bnf_definition,
const char *const start,
const char *const magic_name = NULL,
Node *const magic_ref = NULL){
srand((unsigned)time( NULL ));
// our random sequence is start from here.
// side effect: It's a global C function!
Parser p(bnf_definition, &symbols);
if (!p.parse()) {return;} // how can we break when encounter error?
root = symbols.get_var_ref(start);
put_magic(magic_name, magic_ref);
}
void put_magic(const char *const magic_name, Node *const magic_ref){
symbols.put_var(magic_name, magic_ref);
}
// Return a null-terminated c-string. The buffer is owned by callee.
const char * get_a_string(){
return root->getTargetString();
}
private:
Node * root;
NodeSymbolTable symbols;
};
UBool TestScanner(void){
//const char str1[] = "$root = $command{0,5} $reset $mostRules{1,20};";
//const char str1_r[][20] = {"$root", "=", "$command", "{", "0", ",", "5", "}",
// "$reset", "$mostRules", "{", "1", ",", "20", "}", ";"};
const char str2[] = "$p2 =('\\' $s $string $s)? 25%;";
const char str2_r[][20] = {"$p2", "=", "(", "\\", "$s", "$string", "$s", ")", "?", "25%", ";"};
const char *str = str2;
const char (*str_r)[20] = str2_r;
int tokenNum = sizeof(str2_r)/sizeof(char[20]);
Scanner t(str);
UBool pass = TRUE;
t.getNextToken();
int i = 0;
while (pass){
if (t.tokenType == STREAM_END){
pass = pass? i == tokenNum : FALSE;
break;//while
} else if (t.tokenType == ERROR){
pass = FALSE;
break;//while
} else {
pass = strcmp(t.tokenBuffer, str_r[i++]) == 0 ;
t.getNextToken();
}
}
if (pass){
printf("TestScanner passed.\n");
} else {
printf("TestScanner FAILED!!!\n");
t.dumpCurrentPoint();
}
return pass;
}
UBool TestLiteralizer(){
const char *const str = "This ' A !,z| qq [] .new\tline";
const char *const str_r = "This \\' A '!,'z'|' qq '[]' '.'new\tline";
////
//// :( we must quote our string to following C syntax
//// cannot type the literal here, it makes our code rather human unreadable
//// very very unconformable!
////
///*
//*/
//const char *const s1 = "ab'c";
//const char (* s1_r1) [] = { "ab''c", // ab''c
// "ab\\'c", // ab\'c
// };//
///*
// . '.' \.
// .. \.\. '.'\. '.'\. '..' // '.''.' wrong
//*/
//const char *const s2 = "a..'.b"; // a..'.b
//const char (*s2_r) [] = { "a'..''.'b" // a'..''.'b
// ,"a'..\\'.'b" // a'..\'.'b
// ,"a'..'\\''.'b" // a'..'\''.'b
// };//
//const char *const s3 = "a..\\.b"; // a..\.b
//const char (*s3_r) [] = { "a'..\\\\.'b" // a'..\\.'b
// ,"a'..'\\\\'.'b" // a'..'\\'.'b
// };//
// // no catact operation, no choice, must be compact
srand((unsigned)time( NULL ));
//LiteralToEscape l(LiteralToEscape::NO, LiteralToEscape::NO, LiteralToEscape::RAND_ESC);
LiteralToEscape l;
printf("\n========TestLiteralier start=======\n");
printf(str);
printf("\n-----------------------------------\n");
//printf(r);
for (int i=0; i<10; i++){
const char * s = l(str);
fwrite(s, strlen(s), 1, stdout);
printf("\n");
}
printf("\n~~~~~~~~TestLiteralier end~~~~~~~~~~\n");
// UBool pass = strcmp(str_r,l(str)) == 0;
//if (pass){
// printf("TestLiteralier passed.\n");
// } else {
// printf("TestLiteralier FAILED!!!\n");
// }
// return pass;
return FALSE;
}
UBool TestLiteralNode(){
const char * s = "test string99.";
LiteralNode n(s);
const char * r = n.getTargetString();
UBool pass = strcmp(s,r) == 0;
if (pass){
printf("TestLiteralNode passed.\n");
} else {
printf("TestLiteralNode FAILED!!!\n");
}
return pass;
}
UBool TestMagicNode(){
MagicNode n;
printf("\n========TestMagicNode start=======\n");
for (int i=0; i < 10 ; i++){
printf(n.getTargetString());
printf("\n------------------\n");
}
printf("\n~~~~~~~~TestMagicNode end~~~~~~~~~~\n");
return FALSE;
}
UBool TestSequenceNode(){
SequenceNode n;
LiteralNode * n1 = new LiteralNode("abc ");
LiteralNode * n2 = new LiteralNode(", s");
n.append(n1);
n.append(n2);
const char * r = n.getTargetString();
char * s = "abc , s";
UBool pass = strcmp(s,r) == 0;
if (pass){
printf("TestSequenceNode passed.\n");
} else {
printf("TestSequenceNode FAILED!!!\n");
}
return pass;
}
UBool TestAlternation1Node(){
srand((unsigned)time( NULL ));
Alternation1Node n;
LiteralNode * a = new LiteralNode("a");
LiteralNode * b = new LiteralNode("b");
LiteralNode * c = new LiteralNode("c");
LiteralNode * d = new LiteralNode("c");
n.append(a);
n.append(b);
n.append(c);
n.append(d);
printf("\n========= TestAlternation1Node =============\n");
for(int i=0; i<10; i++){
printf(n.getTargetString());
printf("\n");
}
printf("~~~~~~~~~ TestAlternation1Node ~~~~~~~~~~~~~\n");
return FALSE;
}
UBool TestAlternation2Node(){
srand((unsigned)time( NULL ));
Alternation2Node n;
LiteralNode * n1 = new LiteralNode("boy");
LiteralNode * n2 = new LiteralNode("gggirl");
n.append(n1,10);
n.append(n2,20);
printf("\n========= TestAlternation2Node = 10, 20 =====\n");
for(int i=0; i<10; i++){
printf(n.getTargetString());
printf("\n");
}
printf("~~~~~~~~~ TestAlternation2Node ~~~~~~~~~~~~~\n");
return FALSE;
}
UBool TestRepeatNode(){
srand((unsigned)time( NULL ));
LiteralNode * n1 = new LiteralNode("abc ");
RepeatNode n(n1, 1, 4);
printf("\n========= TestRepeatNode =============\n");
for(int i=0; i<10; i++){
printf(n.getTargetString());
printf("\n");
}
printf("~~~~~~~~~ TestRepeatNode ~~~~~~~~~~~~~\n");
return FALSE;
}
UBool TestVariableNode(){
printf("\n========TestVariableNode===========\n");
VariableNode n("aaa", NULL);
printf(n.getTargetString());
printf("\n~~~~~~~~~ TestVariableNode ~~~~~~~~~~~~~\n");
return FALSE;
}
UBool TestSymbolTable(){
LiteralNode * n1 = new LiteralNode("uvw");
LiteralNode * n2 = new LiteralNode("xyz");
NodeSymbolTable t;
t.put_var("abc", n1);
t.put_var("$aaa", n2);
t.put_var("bbb");
UBool pass;
pass = t.is_var_exist("abc");
pass = pass && t.is_var_exist("$aaa");
pass = pass && t.is_var_exist("bbb");
pass = pass && !t.is_var_exist("ccc");
pass = pass && t.does_var_has_ref("abc");
pass = pass && t.does_var_has_ref("$aaa");
pass = pass && !t.does_var_has_ref("bbb");
pass = pass && !t.does_var_has_ref("zz");
t.reset();
pass = pass && !t.does_var_has_ref("abc");
if (pass){
printf("TestSymbolTable passed.\n");
} else {
printf("TestSymbolTable FAILED!!!\n");
}
return pass;
}
UBool TestParser1(){
const char *const str1 =
"$s = ' ' ? 50%;"
//"$relationList = '<' | '<<' | ';' | '<<<' | ',' | '=';"
"$p1 = ($string $s '|' $s)? 25%;"
"$p2 = ('\\' $s $string $s)? 25%;"
"$rel2 = $p1 $string $s $p2;"
"$relation = $relationList $s ($rel1 | $rel2) $crlf;"
"$command = $commandList $crlf;"
//Raymond: Test code in Java source should be fixed to adapt current syntax
"$reset = '&' $s ($beforeList $s)? 10% ($positionList 100% | $string 10%) $crlf;"
"$mostRules = $command 1% | $reset 5% | $relation 25%;"
"$root = $command{0,5} $reset $mostRules{1,20};"
//"$x = ($var {1,2}) 3%;" // legal.
//"$x = $var {1,2} 3% | b 4%;" // illegal. 3%
//"$x = $var {1,2} 3%;" // illegal. 3%
//"$m = $c ? 2% 4% | $r 5% | $n 25%;" // should failed at '4%'
//"$a = b ? 2% | c 5%;" // should failed at '5%'
//"$x = A B 5% C 10% | D;" // illegal. 5%
//"$x = aa 45% | bb 5% cc;" // illegal. cc
//"$x = (b 5%) (c 6%);" // legal.
//"$x = (b 5%) c 6%;" // legal? illegal.
//"$x = b 5% (c 6%);" // legal? illegal.
//"$x = b 5% c 6%;" // legal? illegal, should failed at 'c'
//"$x = b 5%;" // legal
//"$x = aa 45% | bb 5% cc;" // should failed at 'cc'
//"$x = a | b | c 4% | d 5%;" // should failed at '4%'
//"$s = ' ' ? 50% abc;" // legal.
;
NodeSymbolTable symbol_table;
Parser p(str1, &symbol_table);
UBool pass = p.parse();
symbol_table.reset();
if (pass){
printf("TestParser passed.\n");
} else {
printf("TestParser FAILED!!!\n");
}
return pass;
}
UBool TestRandomLanguageGenerator(){
const char *const def =
"$a = $b;"
"$b = $c;"
"$c = $t;"
"$t = abc z{2,2};"
"$k = a | b | c | d | e | f | g ;"
"$z = a 0% | b 1% | c 10%;"
; // end of string
const char * s = "abczz";
//RandomLanguageGenerator g(def, "$a");
RandomLanguageGenerator g(collationBNF, "$root", "$magic", new MagicNode());
printf("\n_________ TestRandomLanguageGenerator _____________\n");
for (int i= 0; i< 5; i++){
//for (int j = 0; j < 99999999; j++);
const char * r = g.get_a_string();
fwrite(r, strlen(r), 1, stdout);
printf("_____________________________________________________\n");
}
printf("~~~~~~~~~ TestRandomLanguageGenerator ~~~~~~~~~~~~~\n");
return FALSE;
////UBool pass = strcmp(s,r) == 0;
//if (pass){
// printf("TestRandomLanguageGenerator passed.\n");
//} else {
// printf("TestRandomLanguageGenerator FAILED!!!\n");
//}
//return pass;
}
void RandomCollatorTest::Test2(){
TestScanner();
TestLiteralizer();
TestLiteralNode();
TestMagicNode();
TestSequenceNode();
TestAlternation1Node();
TestAlternation2Node();
TestRepeatNode();
TestVariableNode();
TestSymbolTable();
TestParser1();
TestRandomLanguageGenerator();
}
void RandomCollatorTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par){
if (exec) logln("TestSuite RandomCollatorTest: ");
switch (index) {
TESTCASE(0, Test);
default: name = ""; break;
}
}
void RandomCollatorTest::Test(){
RandomLanguageGenerator test_rule(collationBNF, "$root", "$magic", new MagicNode());
//class TestColltorCompare{
//public:
// UBool operator()(Collator &coll, int count = 1000){
// UnicodeString a(test_string.get_a_string());
// UnicodeString b(test_string.get_a_string());
// UnicodeString c(test_string.get_a_string());
// do{
// if (check_transitivity(coll, a, b, c)){
// a = b;
// b = c;
// c = UnicodeString(test_string.get_a_string());
// }
// }while(count-- >= 0 );
// return FALSE;
// }
// TestColltorCompare():test_string("$s = $c{1,8};", "$s", "$c", new Magic_SelectOneChar("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ[]&<")){
// }
//private:
// UBool check_transitivity(const Collator & coll, const UnicodeString &a, const UnicodeString &b, const UnicodeString &c){
// int ab = coll.compare(a,b), ba = coll.compare(b,a);
// int bc = coll.compare(b,c), cb = coll.compare(c,b);
// int ca = coll.compare(c,a), ac = coll.compare(a,c);
// // a
// // / \
// // b - c
// //
// if (//counter-clockwise, maximum
// (ab >=0 && bc >=0 && ac <0)
// ||(bc >=0 && ca >=0 && ba <0)
// ||(ca >=0 && ab >=0 && cb <0)
// //counter-clockwise, minimum
// ||(ab <=0 && bc <=0 && ca >0)
// ||(bc <=0 && ca <=0 && ba >0)
// ||(ca <=0 && ab <=0 && cb >0)
// ){
// return FALSE;
// }
// return TRUE;
// }
// RandomLanguageGenerator test_string;
//} coll_test;
static const int CONSTRUCT_RANDOM_COUNT = 10;
int i;
for (i=0; i < CONSTRUCT_RANDOM_COUNT; i++){
const char * rule = test_rule.get_a_string();
logln("\n-----------------------------------%d\n",i);
logln(UnicodeString(rule, strlen(rule)));
UnicodeString newRule(rule); // potential bug
UErrorCode status = U_ZERO_ERROR;
Collator * c = new RuleBasedCollator(newRule,status);
if (U_FAILURE(status)) {
errln( "Could not create Collator for rules at %d. Error: %s\nRule is: %s\n", i, u_errorName(status), rule);
return;
}
delete c;
}
}
#endif /* #if !UCONFIG_NO_COLLATION */