a19e0c9c4a
X-SVN-Rev: 18906
1346 lines
39 KiB
C++
1346 lines
39 KiB
C++
/*
|
|
******************************************************************************
|
|
* Copyright (C) 2005, International Business Machines Corporation and *
|
|
* others. All Rights Reserved. *
|
|
******************************************************************************
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <stdlib.h>
|
|
#include <time.h>
|
|
|
|
#include "wbnf.h"
|
|
|
|
|
|
///////////////////////////////////////////////////////////
|
|
//
|
|
// Constants and the most basic helper classes
|
|
//
|
|
|
|
// Character tables used by the classification helpers below.
// Each table is a NUL-terminated list of the characters that belong to the class.
static const char DIGIT_CHAR[]  = "0123456789";
static const char WHITE_SPACE[] = "\t \r\n";
static const char ALPHABET[]    = "abcdefghijklmnopqrstuvwxyz"
                                  "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
static const char SPECIAL[]     = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
|
|
|
|
// Tells whether character c occurs in the NUL-terminated table `list`.
// The terminating NUL never counts as a member, so c == 0 always yields FALSE.
static inline UBool isInList(const char c /*in*/, const char list[] /*in*/){
    if (c == 0) {
        return FALSE;
    }
    return strchr(list, c) != NULL ? TRUE : FALSE;
}
|
|
// Character-class predicates; each one looks c up in the matching table.

// TRUE when c is one of '0'..'9'.
static inline UBool isDigit(char c) {
    return isInList(c, DIGIT_CHAR);
}

// TRUE when c is tab, space, carriage return or line feed.
static inline UBool isWhiteSpace(char c) {
    return isInList(c, WHITE_SPACE);
}

// TRUE when c is an ASCII letter (either case).
static inline UBool isAlphabet(char c) {
    return isInList(c, ALPHABET);
}

// TRUE when c is one of the ASCII punctuation characters listed in SPECIAL.
static inline UBool isSpecialAsciiChar(char c) {
    return isInList(c, SPECIAL);
}
|
|
|
|
|
|
|
|
///////////////////////////////////////////////////////////
|
|
//
|
|
// Helper classes
|
|
//
|
|
|
|
class Buffer_byte{
    // Utility class: an automatically growing, zero-filled byte array.
    // Reads and operator-style accesses are NOT bounds checked.
    //
    // Invariants maintained by this class:
    //   * start <= current <= start + buffer_size
    //   * every byte in [current, start + buffer_size) is zero unless a caller
    //     wrote there directly through buffer()
    //   * append() always leaves at least one such zero byte after the content,
    //     so the content can be read as a C-string even when the caller did
    //     not append a terminator itself.

    typedef char byte;
    byte * start;       // first byte of the allocation
    byte * current;     // one past the last content byte
    int buffer_size;    // capacity; size unit is byte
public:
    // Number of content bytes appended since construction or the last reset().
    inline int content_size(){return (int)(current - start);} // size unit is byte

private:
    // Grows the allocation by add_size bytes and zero-fills the new tail.
    // Aborts the process when memory cannot be obtained: the class has no
    // channel to report the failure and continuing would write through NULL.
    inline void expand(int add_size = 100){ // size unit is byte
        if (add_size <= 0) {
            return;
        }
        const int new_size = buffer_size + add_size;
        const int cs_snap = content_size();

        byte * grown = (byte *) realloc(start, new_size); // may move the block
        if (grown == NULL) {
            fprintf(stderr, "Buffer_byte: out of memory\n");
            abort();
        }
        start = grown;
        current = start + cs_snap;

        // Only the freshly added tail is uninitialized; the old slack is
        // already zero by invariant.
        memset(start + buffer_size, 0, add_size);
        buffer_size = new_size;
    }

    // Makes sure the capacity is at least `size` bytes. Grows by at least
    // half of the current capacity so that a long run of small appends does
    // not reallocate on every call.
    inline void expand_to(int size){
        int grow = size - buffer_size;
        if (grow <= 0) {
            return;
        }
        const int step = buffer_size / 2;
        if (grow < step) {
            grow = step;
        }
        expand(grow);
    }

    // Copying is disabled (declared, never defined).
    Buffer_byte(const Buffer_byte &);
    Buffer_byte & operator = (const Buffer_byte &);
public:
    Buffer_byte():start(NULL),current(NULL),buffer_size(0){
        expand();
    }
    ~Buffer_byte(){
        free(start);
    }

    // Discards the content and zero-fills the whole allocation.
    inline void reset(){
        if (start != NULL) {
            memset(start, 0, buffer_size);
        }
        current = start;
    }

    // Appends `size` bytes starting at `c` using a memory copy.
    // A NULL source or a non-positive size is ignored.
    inline void append(const void * c, int size){ // size unit is byte
        if (c == NULL || size <= 0) {
            return;
        }
        expand_to(content_size() + size + 1); // +1: keep a zero byte after the content
        memcpy(current, c, size);
        current = current + size;
    }

    // Start of the storage. The pointer is invalidated by any later append().
    byte * buffer(){
        return start;
    }
};
|
|
|
|
/*
|
|
The class(es) try to work as a built-in array, so they overload these two operators
|
|
operator type *();
|
|
type & operator[];
|
|
The first is used to auto type convert, the latter is used to select member.
|
|
|
|
A small trick is the class does not overload the address-of operator. This
|
|
behavior is different from a built-in array, but it gives us the opportunity
|
|
to get the address of the class itself.
|
|
*/
|
|
//template<typename type>
|
|
// class BUFFER{
|
|
// typedef BUFFER name;
|
|
// BUFFER(type, name) declares a class `name` that presents a Buffer_byte as a
// growable array of `type`:
//   reset()             - drop all elements
//   append(c)           - add one element (its bytes are copied)
//   append_array(p, n)  - add n elements starting at p
//   operator[](i)       - reference to element i, no bounds check
//   operator type *()   - pointer to the first element
//   content_size()      - number of elements stored
// The mutators return *this so calls can be chained.
#define BUFFER(type, name)                                              \
class name {                                                            \
public:                                                                 \
    name & reset() {                                                    \
        buf.reset();                                                    \
        return *this;                                                   \
    }                                                                   \
    name & append(type c) {                                             \
        buf.append(&c, sizeof(type));                                   \
        return *this;                                                   \
    }                                                                   \
    name & append_array(const type * p, int size) {                     \
        buf.append(p, sizeof(type) * size);                             \
        return *this;                                                   \
    }                                                                   \
    type & operator [] (int i) {                                        \
        type * const base = (type *) buf.buffer();                      \
        return base[i];                                                 \
    }                                                                   \
    operator type *() {                                                 \
        return (type *) buf.buffer();                                   \
    }                                                                   \
    int content_size() {                                                \
        return buf.content_size() / sizeof(type);                       \
    }                                                                   \
private:                                                                \
    Buffer_byte buf;                                                    \
};
|
|
|
|
|
|
class Pick{
    /* Pick is the basic element of the language generator: every node of a
     * parsed grammar derives from it. */
public:
    // Generates one string according to the syntax this node represents.
    // Returns a null-terminated c-string. The buffer is owned by the callee.
    virtual const char* next() = 0;

    // Virtual so that nodes can be deleted through a Pick pointer.
    virtual ~Pick(){}
};
|
|
|
|
//typedef BUFFER<char> Buffer_char;
|
|
//typedef BUFFER<int> Buffer_int;
|
|
//typedef BUFFER<Pick *> Buffer_pPick;
|
|
// Concrete typed buffers generated by the BUFFER macro.
BUFFER(char, Buffer_char);     // growable array of char
|
|
BUFFER(int, Buffer_int);       // growable array of int
|
|
BUFFER(Pick *, Buffer_pPick);  // growable array of Pick pointers
|
|
|
|
class SymbolTable{
|
|
/* Helper class.
|
|
* It's a mapping table between 'variable name' and its 'active Pick object'
|
|
*/
|
|
private:
|
|
Buffer_char name_buffer; // var names storage space
|
|
|
|
Buffer_int names; // points to name (offset in name_buffer)
|
|
Buffer_pPick refs; // points to Pick
|
|
|
|
int get_index(const char *const var_name){
|
|
int len = names.content_size();
|
|
for (int i=0; i< len; i++){
|
|
if (strcmp(var_name, name_buffer + names[i]) == 0){
|
|
return i;
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
public:
|
|
enum RESULT {EMPTY, NO_VAR, NO_REF, HAS_REF};
|
|
|
|
RESULT find(const char *const var_name /*[in] c-string*/, Pick * * ref = NULL /*[out] Pick* */){
|
|
if (!var_name) return EMPTY; // NULL name
|
|
|
|
int i = get_index(var_name);
|
|
if (i == -1){
|
|
return NO_VAR; // new name
|
|
}
|
|
if (!refs[i]){ // exist name, no ref
|
|
return NO_REF;
|
|
} else {
|
|
if (ref) {
|
|
*ref = refs[i];
|
|
}
|
|
return HAS_REF; // exist name, has ref
|
|
}
|
|
}
|
|
|
|
void put(const char *const var_name, Pick *const var_ref = NULL){
|
|
int i = get_index(var_name);
|
|
switch(find(var_name)){
|
|
case EMPTY: // NULL name
|
|
break;
|
|
case NO_VAR: // new name
|
|
int offset;
|
|
offset = name_buffer.content_size();
|
|
name_buffer.append_array(var_name, strlen(var_name) + 1);
|
|
names.append(offset);
|
|
refs.append(var_ref);
|
|
break;
|
|
case NO_REF: // exist name, no ref
|
|
refs[i] = var_ref; // link definition with variable
|
|
break;
|
|
case HAS_REF: // exist name, has ref
|
|
if (var_ref){
|
|
refs[i] = var_ref;
|
|
}
|
|
break;
|
|
default:
|
|
; // ASSERT(FALSE);
|
|
}
|
|
return;
|
|
}
|
|
|
|
UBool is_complete(){
|
|
int n = names.content_size();
|
|
for (int i=0; i<n; ++i){
|
|
if (refs[i] == NULL){
|
|
return FALSE;
|
|
}
|
|
}
|
|
return TRUE;
|
|
}
|
|
|
|
void reset(){
|
|
names.reset();
|
|
name_buffer.reset();
|
|
|
|
// release memory here
|
|
int s = refs.content_size();
|
|
for (int i=0; i < s; i++){
|
|
delete refs[i]; // TOFIX: point alias/recursion problem
|
|
}
|
|
refs.reset();
|
|
}
|
|
|
|
~SymbolTable(){
|
|
reset();
|
|
}
|
|
};
|
|
|
|
|
|
|
|
// Document of class Escaper
|
|
//
|
|
// ATTENTION:
|
|
// From http://icu.sourceforge.net/userguide/Collate_Customization.html.
|
|
// We get the precedence of escape/quote operations
|
|
//
|
|
// (highest) 1. backslash \
|
|
// 2. two single quotes ''
|
|
// 3. quoting ' '
|
|
//
|
|
// ICU Collation should accept following as the same string.
|
|
//
|
|
// 1) 'ab'c _
|
|
// 2) a\bc \
|
|
// 3) a'b'\c |- They are equal.
|
|
// 4) abc _/
|
|
//
|
|
// From "two single quotes", we have following deductions
|
|
// D1. empty quoting is illegal. (obviously)
|
|
// D2. no contact operation between two quotings
|
|
// '.''.' is not .. it is .'.
|
|
// D3. "two single quotes" cannot contact two quoting simultaneously
|
|
// '..''''.' is not ..'. it is ..''.
|
|
// NOTICE:
|
|
// "two single quotes" can contact before one quoting
|
|
// '''.' is '.
|
|
// "two single quotes" can literally contact after one quoting
|
|
// But, from syntax, it's one quoting including a "two single quotes"
|
|
// '.''' is .'
|
|
// D4. "two single quotes" cannot solely be included in quoting
|
|
// '''' is not ' it is ''
|
|
// NOTICE: These are legal
|
|
// '.''.' is .'.
|
|
// '.''' is .'
|
|
//
|
|
// decision
|
|
// /\
|
|
// /__\
|
|
// output buffer input buffer
|
|
//
|
|
// To make our decision (within an atomic operation) without caring about the input and output buffers,
|
|
// following calling pattern (within an atom operation) shall be avoided
|
|
//
|
|
// P1 open_quoting() then close_quoting() (direct violation) D1
|
|
// P2 close_quoting() then open_quoting() (direct violation) D2
|
|
// P3 empty open_quoting() (indirect violation) D1, D4
|
|
// P4 empty close_quoting() (indirect violation) D2, D3
|
|
// P5 open_quoting() then two single quotes (indirect violation) D4
|
|
// P6 close_quoting() then two single quotes (indirect violation) D3
|
|
//
|
|
// two single quotes escaping will not open_ or close_ quoting()
|
|
// The choice will not lose some quoting forms.
|
|
//
|
|
// For open_quoting(),
|
|
// we may get this form quoting ''' P5
|
|
// It may raise a bug ''''x
|
|
// If we expect
|
|
// '''.' let the next char open the quoting
|
|
// '.''.' the quoting is already opened by preceding char
|
|
//
|
|
// For close_quoting()
|
|
// we will get this form quoting '.''' P6
|
|
// It may raise a bug '.''''.'
|
|
// If we expect
|
|
// '.'''\. let the next char close the quoting
|
|
// '.''''.' the expectation is wrong! using '.'\''.' instead
|
|
//
|
|
// It's a hard work to re-adjust generation opportunity for various escaping form.
|
|
// We just simply ignore it.
|
|
class Escaper{
|
|
public:
|
|
enum CHOICE {YES, NO, RAND};
|
|
enum ESCAPE_FORM {BSLASH_ONLY, QUOTE_ONLY, QUOTE_AND_BSLAH, RAND_ESC};
|
|
private:
|
|
class Bool{ // A wrapper class for CHOICE, to auto adapter UBool class
|
|
private:
|
|
const CHOICE tag;
|
|
public:
|
|
Bool(CHOICE flag=RAND):tag(flag){}
|
|
operator UBool() { // conversion operator
|
|
return tag == RAND ? rand()%2 : tag == YES;
|
|
//if (tag == RAND){
|
|
// return rand()%2 == 1;
|
|
//} else {
|
|
// return tag == YES ? TRUE : FALSE;
|
|
//}
|
|
}
|
|
};
|
|
public:
|
|
Escaper(CHOICE escapeLiteral = RAND,
|
|
CHOICE twoQuotesEscape = RAND,
|
|
ESCAPE_FORM escapeForm = RAND_ESC):
|
|
escape_form(escapeForm),
|
|
escape_literal(escapeLiteral),
|
|
two_quotes_escape(twoQuotesEscape),
|
|
is_quoting(FALSE){}
|
|
private:
|
|
Buffer_char str;
|
|
ESCAPE_FORM escape_form;
|
|
Bool escape_literal;
|
|
Bool two_quotes_escape;
|
|
UBool quote_escape;
|
|
UBool bslash_escape;
|
|
UBool is_quoting;
|
|
|
|
void set_options(){
|
|
ESCAPE_FORM t = escape_form == RAND_ESC ? (ESCAPE_FORM) (rand()%3) : escape_form;
|
|
switch (t){
|
|
case BSLASH_ONLY :
|
|
bslash_escape = TRUE; quote_escape = FALSE; break;
|
|
case QUOTE_ONLY:
|
|
bslash_escape = FALSE;quote_escape = TRUE; break;
|
|
case QUOTE_AND_BSLAH:
|
|
bslash_escape = TRUE; quote_escape = TRUE; break;
|
|
default:
|
|
;// error
|
|
}
|
|
}
|
|
|
|
void reset(){
|
|
str.reset();
|
|
is_quoting = FALSE;
|
|
}
|
|
|
|
inline void open_quoting(){
|
|
if(is_quoting){
|
|
// do nothing
|
|
} else {
|
|
str.append('\'');
|
|
is_quoting = TRUE;
|
|
}
|
|
}
|
|
inline void close_quoting(){
|
|
if(is_quoting){
|
|
str.append('\'');
|
|
is_quoting = FALSE;
|
|
} else {
|
|
// do nothing
|
|
}
|
|
}
|
|
|
|
// str [in] null-terminated c-string
|
|
void append(const char * strToAppend){
|
|
for(;*strToAppend != 0; strToAppend++){
|
|
append(*strToAppend);
|
|
}
|
|
}
|
|
|
|
inline void append(const char c){
|
|
set_options();
|
|
|
|
if (c == '\\'){
|
|
quote_escape ? open_quoting() : close_quoting();
|
|
//bslash_escape always true here
|
|
str.append('\\');
|
|
str.append('\\');
|
|
} else if (c == '\''){
|
|
if (two_quotes_escape){ // quoted using two single quotes
|
|
// See documents in anonymous.design
|
|
str.append('\'');
|
|
str.append('\'');
|
|
} else{
|
|
quote_escape ? open_quoting() : close_quoting();
|
|
//bslash_escape always true here
|
|
str.append('\\');
|
|
str.append('\'');
|
|
}
|
|
} else if (isSpecialAsciiChar(c) || isWhiteSpace(c)){
|
|
quote_escape ? open_quoting() : close_quoting();
|
|
if (bslash_escape) str.append('\\');
|
|
str.append(c);
|
|
} else { //if (isAlphabet(c) || isDigit(c) || TRUE){ // treat others as literal
|
|
if (escape_literal){
|
|
quote_escape ? open_quoting() : close_quoting();
|
|
if (bslash_escape) str.append('\\');
|
|
str.append(c);
|
|
} else {
|
|
close_quoting();
|
|
str.append(c);
|
|
}
|
|
}
|
|
}
|
|
|
|
public:
|
|
// Return a null-terminate c-string. The buffer is owned by callee.
|
|
char * operator()(const char * literal /*c-string*/){
|
|
str.reset();
|
|
for(;*literal != 0; literal++){
|
|
append(*literal);
|
|
}
|
|
close_quoting(); // P4 exception, to close whole quoting
|
|
return str;
|
|
}
|
|
};
|
|
|
|
class WeightedRand{
|
|
// Return a random number in [0, size)
|
|
// Every number has different chance (aka weight) to be selected.
|
|
private:
|
|
Buffer_int weights;
|
|
double total;
|
|
WeightedRand(const WeightedRand &);
|
|
WeightedRand & operator = (const WeightedRand &);
|
|
public:
|
|
WeightedRand(Buffer_int * weight_list = NULL, int size = 0){
|
|
if ( weight_list == NULL){
|
|
for (int i=0; i<size; ++i) weights.append(DEFAULT_WEIGHT);
|
|
} else {
|
|
int s = weight_list->content_size();
|
|
if (s < size){
|
|
weights.append_array( (*weight_list),s);
|
|
for (int i=s; i<size; ++i) weights.append(DEFAULT_WEIGHT);
|
|
} else { // s >= size
|
|
weights.append_array( (*weight_list),size);
|
|
}
|
|
}
|
|
total = 0;
|
|
int c = weights.content_size();
|
|
for (int i=0; i<c; ++i){
|
|
total += weights[i];
|
|
}
|
|
}
|
|
|
|
void append(int weight){
|
|
weights.append(weight);
|
|
total += weight;
|
|
}
|
|
|
|
// Give a random number with the consideration of weight.
|
|
// Every random number is associated with a weight.
|
|
// It identifies the chance to be selected,
|
|
// larger weight has more chance to be selected.
|
|
//
|
|
//
|
|
// ______________________ every slot has equal chance
|
|
//
|
|
// [____][_][___][______] each item has different slots, hence different chance
|
|
//
|
|
//
|
|
// The algorithms to generate the number is illustrated by preceding figure.
|
|
// First, a slot is selected by rand(). Then we translate the slot to corresponding item.
|
|
//
|
|
int next(){
|
|
// get a random in [0,1]
|
|
double reference_mark = (double)rand() / (double)RAND_MAX;
|
|
|
|
// get the slot's index, 0 <= mark <= total;
|
|
double mark = total * reference_mark;
|
|
|
|
// translate the slot to corresponding item
|
|
int i=0;
|
|
for (;;){
|
|
mark -= weights[i]; // 0 <= mark <= total
|
|
if (mark <= 0)
|
|
break;
|
|
i++;
|
|
}
|
|
return i;
|
|
}
|
|
};
|
|
|
|
///////////////////////////////////////////////////////////
|
|
//
|
|
// The parser result nodes
|
|
//
|
|
|
|
class Literal : public Pick {
|
|
public:
|
|
virtual const char* next(){
|
|
return str;
|
|
}
|
|
Literal(const char * s /*c-string*/){
|
|
str.append_array(s, strlen(s) + 1);
|
|
}
|
|
private:
|
|
Buffer_char str; //null-terminated c-string
|
|
};
|
|
|
|
class Variable : public Pick {
|
|
public:
|
|
Variable(SymbolTable * symbols, const char * varName, Pick * varRef = NULL){
|
|
this->var_name.append_array(varName, strlen(varName) + 1);
|
|
if (symbol_table = symbols){
|
|
symbol_table->put(varName, varRef);
|
|
}
|
|
}
|
|
|
|
operator const char *(){
|
|
return var_name;
|
|
}
|
|
|
|
virtual const char* next(){
|
|
if (symbol_table){
|
|
Pick * var_ref = NULL;
|
|
symbol_table->find(var_name, &var_ref);
|
|
if (var_ref) {
|
|
return var_ref->next();
|
|
}
|
|
}
|
|
return ""; // dumb string
|
|
}
|
|
private:
|
|
Buffer_char var_name;
|
|
SymbolTable * symbol_table;
|
|
};
|
|
|
|
class Quote : public Pick{
|
|
public:
|
|
Quote(Pick & base):item(base),e(Escaper::NO, Escaper::NO, Escaper::BSLASH_ONLY){
|
|
}
|
|
virtual const char* next(){
|
|
return e(item.next());
|
|
}
|
|
private:
|
|
Pick & item;
|
|
Buffer_char str;
|
|
Escaper e;
|
|
};
|
|
|
|
|
|
class Morph : public Pick{
|
|
/*
|
|
The difference between morph and an arbitrary random string is that
|
|
a morph changes slowly. When we build collation rules, for example,
|
|
it is a much better test if the strings we use are all in the same
|
|
'neighborhood'; they share many common characters.
|
|
*/
|
|
public:
|
|
Morph(Pick & base):item(base){}
|
|
|
|
virtual const char* next(){
|
|
current.reset();
|
|
const char * s = item.next();
|
|
current.append_array(s, strlen(s) + 1);
|
|
if (last.content_size() == 0) {
|
|
str.reset();
|
|
last.reset();
|
|
str.append_array(current, current.content_size());
|
|
last.append_array(current, current.content_size());
|
|
} else {
|
|
morph();
|
|
}
|
|
return str;
|
|
}
|
|
private:
|
|
Pick & item;
|
|
Buffer_char str;
|
|
Buffer_char last;
|
|
Buffer_char current;
|
|
|
|
char * p_last;
|
|
char * p_curr;
|
|
|
|
void copy_curr(){
|
|
if (*p_curr) {
|
|
str.append(*p_curr);
|
|
p_curr++;
|
|
}
|
|
}
|
|
|
|
void copy_last(){
|
|
if (*p_last) {
|
|
str.append(*p_last);
|
|
p_last++;
|
|
}
|
|
}
|
|
|
|
// copy 0, 1, or 2 character(s) to str
|
|
void copy(){
|
|
static WeightedRand wr(& Buffer_int().append(DEFAULT_WEIGHT * 10), 5);
|
|
|
|
switch (wr.next()){
|
|
case 0: // copy last -- has 10 times chance than others
|
|
copy_last();
|
|
break;
|
|
case 1: // copy both
|
|
copy_curr();
|
|
copy_last();
|
|
break;
|
|
case 2: // copy both
|
|
copy_last();
|
|
copy_curr();
|
|
break;
|
|
case 3:
|
|
copy_curr();
|
|
break;
|
|
case 4: // copy nothing
|
|
break;
|
|
default:
|
|
// ASSERT(FALSE);
|
|
;
|
|
}
|
|
}
|
|
|
|
void morph(void){
|
|
int min = strlen(last);
|
|
int max = strlen(current);
|
|
if (min > max){
|
|
int temp = min;
|
|
min = max;
|
|
max = temp;
|
|
}
|
|
|
|
int len = min + rand()%(max - min + 1); // min + [0, diff]
|
|
p_curr = current;
|
|
p_last = last;
|
|
str.reset();
|
|
|
|
for (; str.content_size()<len && *p_curr && *p_last;){
|
|
copy(); // copy 0, 1, or 2 character(s) to str
|
|
}
|
|
|
|
if (str.content_size() == len) {
|
|
str.append(0);
|
|
final();
|
|
return;
|
|
}
|
|
|
|
if (str.content_size() > len) { // if the last copy copied two characters
|
|
str[len]=0;
|
|
final();
|
|
return;
|
|
}
|
|
|
|
// str.content_size() < len
|
|
if (*p_last) {
|
|
for (; str.content_size() < len; copy_last());
|
|
} else if (*p_curr){
|
|
for (; str.content_size() < len; copy_curr());
|
|
}
|
|
|
|
int last_len = last.content_size();
|
|
for (;str.content_size() < len;){
|
|
str.append(last[rand()%last_len]);
|
|
}
|
|
str.append(0);
|
|
final();
|
|
}
|
|
|
|
void final(){
|
|
last.reset();
|
|
last.append_array(current, current.content_size());
|
|
}
|
|
};
|
|
|
|
class Sequence : public Pick {
|
|
public:
|
|
virtual const char* next(){
|
|
str.reset();
|
|
int s = items.content_size();
|
|
for(int i=0; i < s; i++){
|
|
const char * t = items[i]->next();
|
|
str.append_array(t, strlen(t));
|
|
}
|
|
str.append(0); // terminal null
|
|
return str;
|
|
}
|
|
|
|
void append (Pick * node){
|
|
items.append(node);
|
|
}
|
|
|
|
virtual ~Sequence(){
|
|
int s = items.content_size();
|
|
for(int i=0; i < s; i++){
|
|
//How can assure the item is got from heap?
|
|
//Let's assume it.
|
|
delete items[i]; // TOFIX: point alias/recursion problem
|
|
items[i] = NULL;
|
|
}
|
|
}
|
|
private:
|
|
Buffer_pPick items;
|
|
Buffer_char str; //null-terminated c-string
|
|
};
|
|
|
|
class Repeat : public Pick {
|
|
private:
|
|
Pick * item;
|
|
Buffer_char str;
|
|
WeightedRand wr;
|
|
int min;
|
|
int max;
|
|
int select_a_count(){
|
|
return min + wr.next();
|
|
}
|
|
public:
|
|
virtual const char* next(){
|
|
str.reset();
|
|
int c = select_a_count();
|
|
for(int i=0; i< c; i++){
|
|
const char * t = item->next();
|
|
str.append_array(t, strlen(t));
|
|
}
|
|
str.append(0);
|
|
return str;
|
|
}
|
|
|
|
Repeat(Pick * base, int minCount =0, int maxCount = 1, Buffer_int * weights = NULL):
|
|
wr(weights, maxCount-minCount +1) {
|
|
this->item = base;
|
|
this->min = minCount;
|
|
this->max = maxCount;
|
|
}
|
|
virtual ~Repeat(){
|
|
delete item; // TOFIX: point alias/recursion problem
|
|
item = NULL;
|
|
}
|
|
};
|
|
|
|
|
|
class Alternation : public Pick {
|
|
public:
|
|
virtual const char* next(){
|
|
str.reset();
|
|
int i = wr.next();
|
|
const char * t = items[i]->next();
|
|
str.append_array(t, strlen(t) + 1);
|
|
return str;
|
|
}
|
|
virtual ~Alternation(){
|
|
int s = items.content_size();
|
|
for(int i=0; i < s; i++){
|
|
delete items[i]; // TOFIX: point alias/recursion problem
|
|
items[i] = NULL;
|
|
}
|
|
}
|
|
|
|
Alternation & append (Pick * node, int weight = DEFAULT_WEIGHT){
|
|
items.append(node);
|
|
wr.append(weight);
|
|
return *this;
|
|
}
|
|
private:
|
|
Buffer_pPick items;
|
|
Buffer_char str; // null-terminated c-string
|
|
WeightedRand wr;
|
|
};
|
|
|
|
///////////////////////////////////////////////////////////
|
|
//
|
|
// The parser
|
|
//
|
|
|
|
// Token kinds produced by Scanner::getNextToken().
// STRING, VAR and NUMBER carry text in Scanner::token; STREAM_END marks the
// end of the input and ERROR an unrecognised or unterminated token. The
// remaining kinds each stand for one punctuation character:
// ? * + { } ( ) ; = , | @ ~ %
enum TokenType {STRING, VAR, NUMBER, STREAM_END, ERROR, QUESTION, STAR, PLUS, LBRACE, RBRACE, LPAR, RPAR, SEMI, EQ, COMMA, BAR, AT, WAVE, PERCENT};
|
|
|
|
class Scanner{
|
|
friend int DumpScanner(Scanner & s, UBool dumb);
|
|
private:
|
|
const char *const source;
|
|
const char * working;
|
|
const char * history; // for debug
|
|
enum StateType {START, IN_NUM, IN_VAR_FIRST, IN_VAR, IN_QUOTE, IN_QUOTE_BSLASH, IN_BSLASH, IN_STRING, DONE};
|
|
StateType state;
|
|
void terminated(TokenType t){
|
|
working--; // return the peeked character
|
|
tokenType = t;
|
|
token.append(0); // close buffer
|
|
state = DONE;
|
|
}
|
|
public:
|
|
// the buffer of "source" is owned by caller
|
|
Scanner(const char *const source/*[in] c-string*/ = NULL):source(source){
|
|
working = source;
|
|
history = working;
|
|
state = DONE;
|
|
tokenType = ERROR;
|
|
}
|
|
|
|
//void setSource(const char *const src /*[in] c-string*/){
|
|
// *(&const_cast<const char *>(source)) = src;
|
|
//}
|
|
|
|
Buffer_char token;
|
|
TokenType tokenType;
|
|
|
|
TokenType getNextToken(){
|
|
token.reset();
|
|
state = START;
|
|
history = working; // for debug
|
|
while (state != DONE){
|
|
char c = *working++;
|
|
if (c == 0 && state != START){//avoid buffer overflow. for IN_QUOE, IN_ESCAPE
|
|
terminated(ERROR);
|
|
break; // while
|
|
}
|
|
switch(state){
|
|
case START:
|
|
tokenType = ERROR;
|
|
switch(c){
|
|
case '?' : tokenType = QUESTION; break;
|
|
case '*' : tokenType = STAR; break;
|
|
case '+' : tokenType = PLUS; break;
|
|
case '{' : tokenType = LBRACE; break;
|
|
case '}' : tokenType = RBRACE; break;
|
|
case '(' : tokenType = LPAR; break;
|
|
case ')' : tokenType = RPAR; break;
|
|
case ';' : tokenType = SEMI; break;
|
|
case '=' : tokenType = EQ; break;
|
|
case ',' : tokenType = COMMA; break;
|
|
case '|' : tokenType = BAR; break;
|
|
case '@' : tokenType = AT; break;
|
|
case '~' : tokenType = WAVE; break;
|
|
case '%' : tokenType = PERCENT; break;
|
|
case 0 : tokenType = STREAM_END; working-- /*avoid buffer overflow*/; break;
|
|
}
|
|
if (tokenType != ERROR){
|
|
token.append(c);
|
|
token.append(0);
|
|
state = DONE;
|
|
break; // START
|
|
}
|
|
switch(c){
|
|
case '$' : state = IN_VAR_FIRST; token.append(c); break;
|
|
case '\'' : state = IN_QUOTE; break;
|
|
case '\\' : state = IN_BSLASH; break;
|
|
default:
|
|
if (isWhiteSpace(c)){ // state = START; //do nothing
|
|
} else if (isDigit(c)){ state = IN_NUM; token.append(c);
|
|
} else if (isAlphabet(c)){ state = IN_STRING; token.append(c);
|
|
} else {terminated(ERROR);}
|
|
}
|
|
break;//START
|
|
case IN_NUM:
|
|
if (isDigit(c)){
|
|
token.append(c);
|
|
} else {
|
|
terminated(NUMBER);
|
|
}
|
|
break;//IN_NUM
|
|
case IN_VAR_FIRST:
|
|
if (isAlphabet(c)){
|
|
token.append(c);
|
|
state = IN_VAR;
|
|
} else {
|
|
terminated(ERROR);
|
|
}
|
|
break; // IN_VAR_FISRT
|
|
case IN_VAR:
|
|
if (isAlphabet(c) || isDigit(c)){
|
|
token.append(c);
|
|
} else {
|
|
terminated(VAR);
|
|
}
|
|
break;//IN_VAR
|
|
case IN_STRING:
|
|
// About the scanner's behavior for STRING, AT, and ESCAPE:
|
|
// All of them can be contacted with each other.
|
|
// This means the scanner will eat up as much as possible strings
|
|
// (STRING, AT, and ESCAPE) at one time, with no regard of their
|
|
// combining sequence.
|
|
//
|
|
if (c == '\''){
|
|
state = IN_QUOTE; // the first time we see single quote
|
|
} else if (c =='\\'){ // back slash character
|
|
state = IN_BSLASH;
|
|
} else if (isAlphabet(c) || isDigit(c)){
|
|
token.append(c);
|
|
} else{
|
|
terminated(STRING);
|
|
}
|
|
break;//IN_STRING
|
|
case IN_QUOTE:
|
|
if (c == '\''){ // the second time we see single quote
|
|
state = IN_STRING; // see document in IN_STRING
|
|
} else if ( c== '\\') { // backslah escape in quote
|
|
state = IN_QUOTE_BSLASH;
|
|
} else {
|
|
token.append(c); // eat up everything, includes back slash
|
|
}
|
|
break;//IN_QUOTE
|
|
case IN_QUOTE_BSLASH:
|
|
case IN_BSLASH:
|
|
switch (c){
|
|
case 'n' : token.append('\n'); break;
|
|
case 'r' : token.append('\r'); break;
|
|
case 't' : token.append('\t'); break;
|
|
case '\'' : token.append('\''); break;
|
|
case '\\' : token.append('\\'); break;
|
|
default: token.append(c); // unknown escaping, treat it as literal
|
|
}
|
|
if (state == IN_BSLASH){
|
|
state = IN_STRING; // see document in IN_STRING
|
|
} else { // state == IN_QUOTE_BSLASH
|
|
state = IN_QUOTE;
|
|
}
|
|
break;//IN_BSLASH
|
|
case DONE: /* should never happen */
|
|
default:
|
|
working--;
|
|
tokenType = ERROR;
|
|
state = DONE;
|
|
break;
|
|
}//switch(state)
|
|
}//while (state != DONE)
|
|
|
|
return tokenType;
|
|
}
|
|
};//class Scanner
|
|
|
|
class Parser{
|
|
private:
|
|
Scanner s;
|
|
TokenType & token;
|
|
int min_max; // for the evil infinite
|
|
|
|
UBool match(TokenType expected){
|
|
if (token == expected) {
|
|
token = s.getNextToken();
|
|
return TRUE;
|
|
} else {
|
|
//s.dumpCurrentPoint();
|
|
return FALSE;
|
|
}
|
|
}
|
|
|
|
UBool weight(int & value){
|
|
if (token == NUMBER){
|
|
int temp = atoi(s.token);
|
|
match(NUMBER);
|
|
if (match(PERCENT)){
|
|
value = temp;
|
|
return TRUE;
|
|
}
|
|
}
|
|
return FALSE;
|
|
}
|
|
|
|
UBool repeat (Pick* &node /*in,out*/){
|
|
if (node == NULL) return FALSE;
|
|
|
|
int count = -2;
|
|
int min = -2;
|
|
int max = -2;
|
|
UBool question = FALSE;
|
|
switch (token){
|
|
case QUESTION:
|
|
match(QUESTION);
|
|
min = 0;
|
|
max = 1;
|
|
count = 2;
|
|
question = TRUE;
|
|
break;
|
|
case STAR:
|
|
match(STAR);
|
|
min = 0;
|
|
max = -1;
|
|
count = -1;
|
|
break;
|
|
case PLUS:
|
|
match(PLUS);
|
|
min = 1;
|
|
max = -1;
|
|
count = -1;
|
|
break;
|
|
case LBRACE:
|
|
match(LBRACE);
|
|
if (token != NUMBER){
|
|
return FALSE;
|
|
}else {
|
|
min = atoi(s.token);
|
|
match(NUMBER);
|
|
if (token == RBRACE){
|
|
match(RBRACE);
|
|
max = min;
|
|
count = 1;
|
|
} else if (token == COMMA) {
|
|
match(COMMA);
|
|
if (token == RBRACE){
|
|
match(RBRACE);
|
|
max = -1;
|
|
count = -1;
|
|
} else if (token == NUMBER) {
|
|
max = atoi(s.token);
|
|
match(NUMBER);
|
|
count = max - min + 1;
|
|
if (!match(RBRACE)) {
|
|
return FALSE;
|
|
}
|
|
} else {
|
|
return FALSE;
|
|
}
|
|
} else {
|
|
return FALSE;
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
return FALSE;
|
|
}
|
|
|
|
if (count == -2 || min == -2 || max == -2){
|
|
//ASSERT(FALSE);
|
|
return FALSE;
|
|
}
|
|
|
|
// eat up following weights
|
|
Buffer_int weights;
|
|
int w;
|
|
while (weight(w)){
|
|
weights.append(w);
|
|
}
|
|
|
|
// for the evil infinite
|
|
min_max = min_max > min ? min_max : min;
|
|
min_max = min_max > max ? min_max : max;
|
|
if (min_max > PSEUDO_INFINIT){
|
|
return FALSE; // PSEUDO_INFINIT is less than the real maximum
|
|
}
|
|
if (max == -1){ // the evil infinite
|
|
max = PSEUDO_INFINIT;
|
|
}
|
|
// for the strange question mark
|
|
if (question && weights.content_size() > 0){
|
|
Buffer_int w2;
|
|
w2.append(DEFAULT_WEIGHT - weights[0]).append(weights[0]);
|
|
node = new Repeat(node,min,max,&w2);
|
|
return TRUE;
|
|
}
|
|
node = new Repeat(node,min,max,&weights);
|
|
return TRUE;
|
|
}
|
|
|
|
UBool core(Pick* &node /*out*/){
|
|
if (node != NULL) return FALSE; //assert node == NULL
|
|
|
|
switch(token){
|
|
case LPAR:
|
|
match(LPAR);
|
|
if(defination(node) && match(RPAR)){
|
|
return TRUE;
|
|
}
|
|
return FALSE;
|
|
case VAR:
|
|
node = new Variable(&symbols, s.token);
|
|
match(VAR);
|
|
return TRUE;
|
|
case STRING:
|
|
node = new Literal(s.token);
|
|
match(STRING);
|
|
return TRUE;
|
|
default:
|
|
return FALSE;
|
|
}
|
|
}
|
|
UBool modified(Pick* &node /*out*/){
|
|
if (node != NULL) return FALSE; //assert node == NULL
|
|
|
|
if (!core(node)) {
|
|
return FALSE;
|
|
}
|
|
|
|
for (;;){
|
|
switch(token){
|
|
case WAVE:
|
|
match(WAVE);
|
|
node = new Morph(*node);
|
|
break;
|
|
case AT:
|
|
match(AT);
|
|
node = new Quote(*node);
|
|
break;
|
|
case QUESTION:
|
|
case STAR:
|
|
case PLUS:
|
|
case LBRACE:
|
|
if (!repeat(node)) return FALSE;
|
|
break;
|
|
case SEMI: // rule definiation closed
|
|
case RPAR: // within parenthesis (core closed)
|
|
case BAR: // in alternation
|
|
case NUMBER: // in alternation, with weight
|
|
case LPAR: // in sequence
|
|
case VAR: // in sequence
|
|
case STRING: // in sequence
|
|
return TRUE;
|
|
default:
|
|
return FALSE;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
UBool sequence_list(Pick* &node /*in,out*/){
|
|
if (node == NULL) return FALSE; // assert node != NULL
|
|
|
|
Sequence* seq = new Sequence();
|
|
Pick * n = node;
|
|
|
|
while (token == VAR || token == STRING || token == LPAR){
|
|
seq->append(n);
|
|
n = NULL;
|
|
if (modified(n)){
|
|
// go on
|
|
} else {
|
|
goto FAIL;
|
|
}
|
|
}
|
|
|
|
if (token == SEMI || token == RPAR || token == BAR){
|
|
seq->append(n);
|
|
node = seq;
|
|
return TRUE;
|
|
}
|
|
FAIL:
|
|
delete seq;
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
UBool sequence(Pick* &node /*out*/){
|
|
if (node != NULL) return FALSE; //assert node == NULL
|
|
|
|
if (!modified(node)) {
|
|
return FALSE;
|
|
}
|
|
|
|
if (token == VAR || token == STRING || token == LPAR){
|
|
return sequence_list(node);
|
|
} else {
|
|
return TRUE; // just a modified
|
|
}
|
|
}
|
|
|
|
UBool alternation_list(Pick* &node /*in,out*/){
|
|
if (node == NULL) return FALSE; // assert node != NULL
|
|
|
|
Alternation * alt = new Alternation();
|
|
Pick * n = node;
|
|
int w = DEFAULT_WEIGHT;
|
|
|
|
while (token == NUMBER || token == BAR){
|
|
if(token == NUMBER) {
|
|
if (weight(w)){
|
|
if (token == BAR){
|
|
// the middle item, go on
|
|
} else {
|
|
// the last item or encounter error
|
|
break; //while
|
|
}
|
|
} else {
|
|
goto FAIL;
|
|
}
|
|
} // else token == BAR
|
|
match(BAR);
|
|
alt->append(n,w);
|
|
|
|
n = NULL;
|
|
w = DEFAULT_WEIGHT;
|
|
if (sequence(n)){
|
|
// go on
|
|
} else {
|
|
goto FAIL;
|
|
}
|
|
}
|
|
|
|
if (token == SEMI || token == RPAR) {
|
|
alt->append(n,w);
|
|
node = alt;
|
|
return TRUE;
|
|
}
|
|
FAIL:
|
|
delete alt;
|
|
return FALSE;
|
|
}
|
|
|
|
UBool alternation(Pick* &node /*out*/){
|
|
if (node != NULL) return FALSE; //assert node == NULL
|
|
|
|
// 'sequence' has higher precedence than 'alternation'
|
|
if (!sequence(node)){
|
|
return FALSE;
|
|
}
|
|
|
|
if (token == BAR || token == NUMBER){ // find a real alternation1, create it.
|
|
return alternation_list(node);
|
|
} else {
|
|
return TRUE; // just a sequence_old
|
|
}
|
|
}
|
|
|
|
|
|
UBool defination(Pick* &node /*out*/){
|
|
if (node != NULL) return FALSE; //assert node == NULL
|
|
return alternation(node);
|
|
}
|
|
|
|
UBool rule(){
|
|
if (token == VAR){
|
|
Buffer_char name;
|
|
name.append_array(s.token, strlen(s.token) + 1);
|
|
match(VAR);
|
|
|
|
if (match(EQ)){
|
|
Pick * t = NULL;
|
|
if(defination(t)){
|
|
symbols.put(name, t);
|
|
return match(SEMI);
|
|
}
|
|
}
|
|
}
|
|
return FALSE;
|
|
}
|
|
public:
|
|
UBool rules(){
|
|
symbols.reset();
|
|
token = s.getNextToken();
|
|
while (rule()){
|
|
}
|
|
if (token == STREAM_END){
|
|
return TRUE;
|
|
} else {
|
|
//s.dumpCurrentPoint();
|
|
return FALSE;
|
|
}
|
|
}
|
|
|
|
friend UBool TestParser();
|
|
friend class LanguageGenerator_impl;
|
|
public:
|
|
SymbolTable symbols;
|
|
|
|
Parser(const char *const source):s(source), token(s.tokenType){
|
|
min_max = -2;
|
|
}
|
|
UBool parse(){
|
|
return rules();
|
|
}
|
|
|
|
}; // class Parser
|
|
|
|
|
|
///////////////////////////////////////////////////////////
|
|
//
|
|
//
|
|
//
|
|
|
|
class LanguageGenerator_impl{
|
|
public:
|
|
LanguageGenerator_impl(const char *const bnf_definition, const char *const top_node)
|
|
:par(bnf_definition), top_node_name(top_node){
|
|
srand((unsigned)time( NULL ));
|
|
};
|
|
|
|
LanguageGenerator::PARSE_RESULT parseBNF(UBool debug = TRUE){
|
|
if (par.parse()){
|
|
if (par.symbols.find(top_node_name, &top_node_ref) == SymbolTable::HAS_REF) {
|
|
if (par.symbols.is_complete()) {
|
|
return LanguageGenerator::OK;
|
|
} else {
|
|
if (debug) printf("The bnf definition is incomplete.\n");
|
|
return LanguageGenerator::INCOMPLETE;
|
|
}
|
|
} else {
|
|
if (debug) printf("No top node is found.\n");
|
|
return LanguageGenerator::NO_TOP_NODE;
|
|
}
|
|
} else {
|
|
if(debug) {
|
|
printf("The bnf definition is wrong\n");
|
|
DumpScanner(par.s, TRUE);
|
|
}
|
|
return LanguageGenerator::BNF_DEF_WRONG;
|
|
}
|
|
}
|
|
const char * next(){
|
|
return top_node_ref->next();
|
|
};
|
|
|
|
private:
|
|
Parser par;
|
|
const char *const top_node_name;
|
|
Pick * top_node_ref;
|
|
};
|
|
|
|
// Creates a generator with no grammar loaded.
LanguageGenerator::LanguageGenerator()
    : lang_gen(NULL)
{
}
|
|
|
|
// Releases the implementation object, if any.
LanguageGenerator::~LanguageGenerator()
{
    delete lang_gen;
}
|
|
|
|
// Replaces any previously loaded grammar with bnf_definition and parses it.
// On every result other than OK the generator is left with no grammar loaded.
LanguageGenerator::PARSE_RESULT LanguageGenerator::parseBNF(const char *const bnf_definition /*in*/, const char *const top_node/*in*/, UBool debug){
    delete lang_gen;    // deleting NULL is a no-op
    lang_gen = new LanguageGenerator_impl(bnf_definition, top_node);

    const PARSE_RESULT r = lang_gen->parseBNF(debug);
    if (r != OK){
        delete lang_gen;
        lang_gen = NULL;
    }
    return r;
}
|
|
const char *LanguageGenerator::next(){ // Return a null-terminated c-string. The buffer is owned by callee.
|
|
if (lang_gen){
|
|
return lang_gen->next();
|
|
}else {
|
|
return "";
|
|
}
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////
|
|
//
|
|
// The test code for WBNF
|
|
//
|
|
|
|
#include "wbnftest.h"
|