scuffed-code/tools/colprobe/colprobe.cpp

/*
*******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*
* File colprobe.cpp
*
* Modification History:
*
*   Date        Name        Description
*   03/18/2003  weiv        Creation.
*******************************************************************************
*/

#include "uoptions.h"
#include "unicode/ucol.h"
#include "unicode/ucoleitr.h"
#include "unicode/ures.h"
#include "unicode/uniset.h"
#include "unicode/usetiter.h"
#include "unicode/ustring.h"
#include "unicode/uchar.h"
#include "unicode/uscript.h"
#include "uprops.h"
#include "hash.h"
#include "ucol_imp.h"

#include "unicode/ustdio.h"
#include "unicode/utrans.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <io.h>
#include <fcntl.h>

#include "colprobe.h"


#ifdef WIN32
#include <windows.h>
#else
//
//  Stubs for Windows API functions when building on UNIXes.
//
//  The comparison/mapping stubs always return 0, so the "win" platform
//  produces no meaningful ordering on non-Windows builds; they exist only
//  so that the file compiles and links.
//
typedef int DWORD;
inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
#include <sys/time.h>
// Millisecond tick counter built on gettimeofday().
// The value is allowed to wrap around; the arithmetic is done in unsigned
// long so that the wrap is well defined (signed overflow would be undefined
// behaviour where time_t is a 32-bit signed type).
unsigned long timeGetTime() {
    struct timeval t;
    gettimeofday(&t, 0);
    unsigned long val = (unsigned long)t.tv_sec * 1000UL;
    val += (unsigned long)t.tv_usec / 1000UL;
    return val;
}
inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
const int LCMAP_SORTKEY = 0;
#define MAKELCID(a,b) 0
const int SORT_DEFAULT = 0;
#endif

#include "line.h"

// Mode flags; processArgs() sets them from the command-line options.
static UBool gVerbose = FALSE;   // --verbose (also switched on by --debug)
static UBool gDebug = FALSE;     // --debug
static UBool gQuiet = FALSE;     // --quiet: suppresses the "." progress output
static UBool gExemplar = FALSE;  // --exemplar

DWORD          gWinLCID;         // Windows locale id used by Winstrcmp(); set in processArgs()
int            gCount;           // incremented on every comparator call (Winstrcmp/ICUstrcmp)
Line          **gICULines;       // NOTE(review): not referenced in the visible part of this file
UCollator     *gCol;             // collator used by ICUstrcmp() and isTailored()
UCollator     *gUCA;             // reference collator that isTailored() compares gCol against
// Scratch lines that trySwamped()/trySwamps() fill with the probe strings.
Line          source;
Line          target;
Line          *gSource = &source;
Line          *gTarget = &target;
// Tables keyed by UnicodeString(name, len) holding Line* values.
Hashtable     gElements(FALSE);
Hashtable     gExpansions(FALSE);
CompareFn gComparer;

// U+0030 (digit zero): placed between the probe character and the line text
// by trySwamped()/trySwamps().
const UChar separatorChar = 0x0030;

// Output streams; they are NULL here and must be opened before any of the
// printing functions below are used (presumably done at startup - not visible here).
UFILE *out = NULL;
UFILE *err = NULL;
UFILE *log = NULL; 

const char *progName = "colprobe";

const char *gLocale = NULL;      // value of --locale, if given
//char platform[256];
int32_t platformIndex = -1;
int32_t gPlatformNo = 0;         // number of valid entries in gPlatformIndexes
int32_t gPlatformIndexes[10];    // indexes into platforms[], filled by addPlatform()
int32_t gLocaleNo = 0;           // number of valid entries in gLocales
const char* gLocales[100];       // locales taken from the remaining arguments when --locale is absent
UBool gRulesStdin = FALSE;       // --rulesstdin

// Indexes into options[]; the numeric values must match the position of
// the corresponding entry in that table.
enum {
    HELP1      = 0,
    HELP2      = 1,
    VERBOSE    = 2,
    QUIET      = 3,
    VERSION    = 4,
    ICUDATADIR = 5,
    COPYRIGHT  = 6,
    LOCALE     = 7,
    PLATFORM   = 8,
    DEBUG      = 9,
    EXEMPLAR   = 10,
    RULESSTDIN = 11
};

// Command-line option table handed to u_parseArgs() in processArgs().
// The numbered comments give each entry's index; the code addresses the
// entries through the enum above (HELP1 ... RULESSTDIN), so the order of
// the two lists has to stay in sync.
UOption options[]={
  /*0*/ UOPTION_HELP_H,
  /*1*/ UOPTION_HELP_QUESTION_MARK,
  /*2*/ UOPTION_VERBOSE,
  /*3*/ UOPTION_QUIET,
  /*4*/ UOPTION_VERSION,
  /*5*/ UOPTION_ICUDATADIR,
  /*6*/ UOPTION_COPYRIGHT,
  /*7*/ UOPTION_DEF("locale", 'l', UOPT_REQUIRES_ARG),
  /*8*/ UOPTION_DEF("platform", 'p', UOPT_REQUIRES_ARG),
  /*9*/ UOPTION_DEF("debug", 'D', UOPT_NO_ARG),
  /*10*/ UOPTION_DEF("exemplar", 'E', UOPT_NO_ARG),
  /*11*/ UOPTION_DEF("rulesstdin", 'R', UOPT_NO_ARG)
};

int Winstrcmp(const void *a, const void *b) {
    gCount++;
    int t;
    t = CompareStringW(gWinLCID, 0, 
      (*(Line **)a)->name, (*(Line **)a)->len, 
      (*(Line **)b)->name, (*(Line **)b)->len);
    return t-2;
}

int ICUstrcmp(const void *a, const void *b) {
    gCount++;
    UCollationResult t;
    t = ucol_strcoll(gCol, 
      (*(Line **)a)->name, (*(Line **)a)->len,  
      (*(Line **)b)->name, (*(Line **)b)->len);
    if (t == UCOL_LESS) return -1;
    if (t == UCOL_GREATER) return +1;
    return 0;
}

// Table of supported comparison back ends: the name accepted by the
// --platform option and the qsort-style comparator that implements it.
// addPlatform() stores indexes into this table in gPlatformIndexes.
struct {
  const char* name;
  CompareFn comparer;
} platforms[] = {
  { "icu", ICUstrcmp },
  { "win", Winstrcmp}
};


void deleteLineElement(void *line) {
  delete((Line *)line);
}

// Lower-cases a NUL-terminated C string in place using tolower().
// A NULL pointer is accepted and ignored.
void stringToLower(char *string) {
  if(string == NULL) {
    return;
  }
  // Single pass over the string: the terminator is tested directly
  // instead of re-running strlen() on every iteration.
  for(char *p = string; *p != '\0'; ++p) {
    // tolower() requires a value representable as unsigned char (or EOF);
    // a plain char may be negative, which would be undefined behaviour.
    *p = (char)tolower((unsigned char)*p);
  }
}

// Prints the one-line command synopsis to the `out` stream.
// name: program name substituted into the message.
void usage(const char *name) {
  u_fprintf(out, "Usage: %s --locale loc_name --platform platform\n", name);
}

void listKnownPlatforms() {
  int32_t i = 0;
  u_fprintf(err, "Known platforms:\n");
  for(i = 0; i < sizeof(platforms)/sizeof(platforms[0]); i++) {
    u_fprintf(err, "\t%s\n", platforms[i]);
  }
}

void addPlatform(const char *platform) {
  int32_t i;
  //stringToLower(platform);
  int32_t oldPlatformNo = gPlatformNo;

  for(i = 0; i < sizeof(platforms)/sizeof(platforms[0]); i++) {
    if(strcmp(platform, platforms[i].name) == 0) {
      gPlatformIndexes[gPlatformNo++] = i;
    }
  }
  if(gPlatformNo == oldPlatformNo) {
    u_fprintf(err, "Unknown platform %s\n", platform);
    listKnownPlatforms();
  }
}

void processArgs(int argc, char* argv[], UErrorCode &status)
{
  int32_t i = 0;
  U_MAIN_INIT_ARGS(argc, argv);

  argc = u_parseArgs(argc, argv, (int32_t)(sizeof(options)/sizeof(options[0])), options);

  if(argc < 0) {
    u_fprintf(err, "Unknown option: %s\n", argv[-argc]);
    usage(progName);
    return;
  }

  if(options[0].doesOccur || options[1].doesOccur) {
    usage(progName);
    return;
  }
  if(options[VERBOSE].doesOccur) {
    gVerbose = TRUE;
  }
  if(options[DEBUG].doesOccur) {
    gDebug = TRUE;
    gVerbose = TRUE;
  }
  if(options[EXEMPLAR].doesOccur) {
    gExemplar = TRUE;
  }
  if(options[QUIET].doesOccur) {
    gQuiet = TRUE;
  }
/*
  for(i = 8; i < 9; i++) {
    if(!options[i].doesOccur) {
      u_fprintf(err, "Option %s is required!\n", options[i].longName);
      usage(progName);
      status = U_ILLEGAL_ARGUMENT_ERROR;
    }
    if(options[i].value == NULL) {
      u_fprintf(err, "Option %s needs an argument!\n", options[i].longName);
      usage(progName);
      status = U_ILLEGAL_ARGUMENT_ERROR;
    }
  }
*/
  // ASCII based options specified on the command line
  // this is for testing purposes, will allow to load
  // up ICU rules and then poke through them.
  // In that case, we test only ICU and don't need 
  // a locale.
  if(options[RULESSTDIN].doesOccur) {
    gRulesStdin = TRUE;
    addPlatform("icu");
    return;
  } 

  if(options[LOCALE].doesOccur) {
    gLocale = options[LOCALE].value;
  } else {
    for(i = 1; i < argc; i++) {
      gLocales[gLocaleNo++] = argv[i];
    }
  }
  if(options[PLATFORM].doesOccur) {
    //strcpy(platform, options[PLATFORM].value);
    //addPlatform("icu");
    addPlatform(options[PLATFORM].value);
  } else { // there is a list of platforms 
    u_fprintf(err, "Option %s is required!\n", options[i].longName);
    usage(progName);
    status = U_ILLEGAL_ARGUMENT_ERROR;
  }

  //
  //  Set up a Windows LCID
  //
  gWinLCID = uloc_getLCID(gLocale);
  /*
  if (gLocale != 0) {
      gWinLCID = MAKELCID(gLocale, SORT_DEFAULT);
  }
  else {
      gWinLCID = uloc_getLCID(gLocale);
  }
  */

}

void printRules(const UChar *name, int32_t len, UFILE *file) {
  // very rudimentary pretty rules print
  int32_t i = 0;
  UChar toPrint[16384];
  int32_t toPrintIndex = 0;
  for(i = 0; i < len; i++) {
    if(name[i] == 0x0026) {
      if(toPrintIndex) {
        toPrint[toPrintIndex] = 0;
        u_fprintf(file, "%U\n", toPrint);
        toPrintIndex = 0;
        toPrint[toPrintIndex++] = name[i];
      } else {
        toPrint[toPrintIndex++] = name[i];
      } 
    } else {
      toPrint[toPrintIndex++] = name[i];
    }
  }
  if(toPrintIndex) {
    toPrint[toPrintIndex] = 0;
    u_fprintf(file, "%U\n", toPrint);
    toPrintIndex = 0;
  }


}

// Writes the string `name` to `file` using the %U conversion.
// Despite the function name, no escaping is performed at present: the
// escaping loop is commented out. `len` is therefore unused, and `name`
// has to be NUL-terminated for %U to know where the string ends.
void escapeString(const UChar *name, int32_t len, UFILE *file) {
  u_fprintf(file, "%U", name);
/*
  int32_t j = 0;
  for(j = 0; j < len; j++) {
    if(name[j] >= 0x20 && name[j] < 0x80) {
      u_fprintf(file, "%c", name[j]);
    } else {
      u_fprintf(file, "\\u%04X", name[j]);
    }
  }
*/
}
// Writes the text of a line (line->name) to `file` via escapeString().
void escapeALine(Line *line, UFILE *file) {
  escapeString(line->name, line->len, file);
}

// Writes the expansion part of a line (line->expansionString) to `file`
// via escapeString().
void escapeExpansion(Line *line, UFILE *file) {
  escapeString(line->expansionString, line->expLen, file);
}

// Writes the extended Unicode character name of every UTF-16 code unit of
// line->name to `file`, each followed by a single space.
//
// The error code is reset for every lookup: ICU functions return
// immediately when handed a failing status, so one failed lookup would
// otherwise make all following iterations print stale buffer contents.
// A lookup that fails prints an empty name.
void showNames(Line *line, UFILE *file) {
  char charName[256];
  // Keep the last byte as a permanent terminator so the buffer is always a
  // valid C string, even if a name fills the capacity exactly.
  charName[sizeof(charName) - 1] = 0;

  for(int32_t j = 0; j < line->len; j++) {
    UErrorCode status = U_ZERO_ERROR;
    charName[0] = 0;
    u_charName(line->name[j], U_EXTENDED_CHAR_NAME, charName,
               (int32_t)(sizeof(charName) - 1), &status);
    if(U_FAILURE(status)) {
      charName[0] = 0;
    }
    u_fprintf(file, "%s ", charName);
  }
}

// Fills array[0..size-1] with the addresses of the consecutive elements
// contents[0..size-1]. Does nothing when size is not positive.
void setArray(Line **array, Line *contents, int32_t size) {
  Line **slot = array;
  Line *element = contents;
  for(int32_t remaining = size; remaining > 0; --remaining) {
    *slot++ = element++;
  }
}

// set an array from a Hashtable
int32_t 
setArray(Line **array, Hashtable *table = &gElements) {
  // The element count is sampled before iterating and is what gets returned.
  const int32_t total = table->count();
  int32_t position = -1;   // iteration cursor, updated by nextElement()
  Line **slot = array;

  // Copy every value of the table (in the table's iteration order) into
  // the array; the caller must provide room for count() pointers.
  for(const UHashElement *entry = table->nextElement(position);
      entry != NULL;
      entry = table->nextElement(position)) {
    *slot++ = (Line *)entry->value.pointer;
  }
  return total;
}

// Suffix probe. Builds
//     gSource = *smaller + separatorChar + chars[0]
//     gTarget = *greater + separatorChar + chars[1]
// in the global scratch lines and compares them with `comparer`.
// Returns TRUE when the probe pair reverses the order, i.e. when the
// augmented "smaller" string now compares greater than the augmented
// "greater" string; FALSE otherwise.
UBool trySwamped(Line **smaller, Line **greater, UChar chars[2], CompareFn comparer) {
  // source side
  const int32_t smallLen = (*smaller)->len;
  u_strcpy(gSource->name, (*smaller)->name);
  UChar *srcTail = gSource->name + smallLen;
  srcTail[0] = separatorChar;
  srcTail[1] = chars[0];
  srcTail[2] = 0;
  gSource->len = smallLen + 2;

  // target side
  const int32_t greatLen = (*greater)->len;
  u_strcpy(gTarget->name, (*greater)->name);
  UChar *tgtTail = gTarget->name + greatLen;
  tgtTail[0] = separatorChar;
  tgtTail[1] = chars[1];
  tgtTail[2] = 0;
  gTarget->len = greatLen + 2;

  return comparer(&gSource, &gTarget) > 0 ? TRUE : FALSE;
}

// Prefix probe. Builds
//     gSource = chars[0] + separatorChar + *smaller
//     gTarget = chars[1] + separatorChar + *greater
// in the global scratch lines and compares them with `comparer`.
// Returns TRUE when the original order survives the prefixes, i.e. when
// the augmented "smaller" string still compares less than the augmented
// "greater" string; FALSE otherwise.
UBool trySwamps(Line **smaller, Line **greater, UChar chars[2], CompareFn comparer) {
  // source side
  UChar *src = gSource->name;
  src[0] = chars[0];
  src[1] = separatorChar;
  u_strcpy(src + 2, (*smaller)->name);
  gSource->len = (*smaller)->len + 2;

  // target side
  UChar *tgt = gTarget->name;
  tgt[0] = chars[1];
  tgt[1] = separatorChar;
  u_strcpy(tgt + 2, (*greater)->name);
  gTarget->len = (*greater)->len + 2;

  return comparer(&gSource, &gTarget) < 0 ? TRUE : FALSE;
}

// Determines at which collation level *prevLine and *currLine differ under
// `comparer`.
//
// Returns
//   UCOL_IDENTICAL  the two lines compare equal;
//   UCOL_OFF        *prevLine compares greater, i.e. the pair is out of order;
//   UCOL_PRIMARY    the order survives a secondary-different prefix pair;
//   UCOL_SECONDARY  the order survives a tertiary-different prefix pair;
//   UCOL_TERTIARY   a secondary-different suffix pair reverses the order;
//   UCOL_ON         none of the probes was conclusive.
// The probes are tried in exactly that sequence and the first hit wins.
UColAttributeValue 
probeStrength(Line** prevLine, Line **currLine, CompareFn comparer) {
  // Each table holds several candidate probe pairs; only row 0 is used at
  // present (see `pair` below).

  // Primary swamps secondary: pairs where [0] 2> [1]
  UChar primSwamps[][2] = {
    { 0x00E0, 0x0061 },
    { 0x0450, 0x0435 },
    { 0x31a3, 0x310d }
  };
  // Secondary swamps tertiary: pairs where [0] 3> [1]
  UChar secSwamps[][2] = {
    { 0x0053, 0x0073 },
    { 0x0415, 0x0435 },
    { 0x31b6, 0x310e }
  };
  // Secondary is swamped by primary: pairs where [0] 1> [1]
  // (declared for completeness, not used by any probe below)
  UChar secSwamped[][2] = {
    { 0x0062, 0x0061 },
    { 0x0436, 0x0454 },
    { 0x310e, 0x310d }
  };
  // Tertiary is swamped by secondary: pairs where [0] 2> [1]
  UChar terSwamped[][2] = {
    { 0x00E0, 0x0061 },
    { 0x0450, 0x0435 },
    { 0x31a3, 0x310d }
  };

  // Row of the probe tables to use; choosing the row from the script of
  // the lines was tried and disabled, so it is fixed to the first one.
  const int32_t pair = 0;

  const int order = comparer(prevLine, currLine);
  if(order == 0) {
    return UCOL_IDENTICAL;
  }
  if(order > 0) {
    // the lines should have been ordered
    return UCOL_OFF;
  }
  if(trySwamps(prevLine, currLine, primSwamps[pair], comparer)) {
    return UCOL_PRIMARY;
  }
  if(trySwamps(prevLine, currLine, secSwamps[pair], comparer)) {
    return UCOL_SECONDARY;
  }
  if(trySwamped(prevLine, currLine, terSwamped[pair], comparer)) {
    // there is a tertiary difference
    return UCOL_TERTIARY;
  }
  // unknown strength
  return UCOL_ON;
}

// Walks an array of line pointers that has already been sorted (by qsort)
// and deduces, for every element but the first, the strength of its
// difference from its predecessor. Also records each element's position
// in sortedIndex and links neighbours through previous/next.
// The first element's strength, sortedIndex and previous are left untouched.
void 
analyzeStrength(Line **lines, int32_t size, CompareFn comparer) {
  for(int32_t pos = 1; pos < size; pos++) {
    Line **prevSlot = &lines[pos - 1];
    Line **currSlot = &lines[pos];

    (*currSlot)->strength = probeStrength(prevSlot, currSlot, comparer);
    (*currSlot)->sortedIndex = pos;
    (*currSlot)->previous = *prevSlot;
    (*prevSlot)->next = *currSlot;
  }
}

// Writes the rule-syntax relation for `strength` to `file`, surrounded by
// single spaces:
//   UCOL_IDENTICAL -> "="     UCOL_TERTIARY -> "<<<"
//   UCOL_SECONDARY -> "<<"    UCOL_PRIMARY  -> "<"
//   UCOL_OFF       -> ">?"    (pair is out of order)
//   anything else  -> "?!"    (strength could not be determined)
void printStrength(UColAttributeValue strength, UFILE *file) {
  u_fprintf(file, " ");
  switch(strength) {
  case UCOL_IDENTICAL:
    u_fprintf(file, "=");
    break;
  case UCOL_TERTIARY:
    u_fprintf(file, "<<<");
    break;
  case UCOL_SECONDARY:
    u_fprintf(file, "<<");
    break;
  case UCOL_PRIMARY:
    u_fprintf(file, "<");
    break;
  case UCOL_OFF:
    u_fprintf(file, ">?");
    // Without this break the marker for "unknown" was printed as well,
    // producing ">??!".
    break;
  default:
    u_fprintf(file, "?!");
    break;
  }
  u_fprintf(file, " ");
}

// Convenience overload: prints the relation symbol for line->strength.
void printStrength(Line *line, UFILE *file) {
  printStrength(line->strength, file);
}

// Writes a line to `file`: its text, and for expansions additionally
// "/" followed by the expansion string.
void printLine(Line *line, UFILE *file) {
  escapeALine(line, file);
  if(!line->isExpansion) {
    return;
  }
  u_fprintf(file, "/");
  escapeExpansion(line, file);
}

// Prints an ordering in rule-like syntax to `file`.
//
// lines:    array of line pointers; lines[0] is always the first element
//           printed (the array must not be empty).
// size:     upper bound on the number of elements visited.
// useLinks: FALSE - walk the array by index;
//           TRUE  - follow the next pointers starting from lines[0].
//
// Reset lines are printed as "\n& text"; removed lines are skipped; every
// other line after the first is preceded by its strength symbol (and by a
// newline when writing to `out`). In both modes the walk stops as soon as
// the element just handled has no next pointer. A final "\n" is always
// written.
void printOrdering(Line **lines, int32_t size, UFILE *file, UBool useLinks = FALSE) {
  Line *previous = *lines;

  // first element: no strength symbol in front of it
  if(previous->isReset) {
    u_fprintf(file, "\n& ");
    escapeALine(previous, file);
  } else if(!previous->isRemoved) {
    printLine(previous, file);
  }

  for(int32_t pos = 1; pos < size && previous->next; pos++) {
    Line *line = useLinks ? previous->next : lines[pos];

    if(line->isReset) {
      u_fprintf(file, "\n& ");
      escapeALine(line, file);
    } else if(!line->isRemoved) {
      if(file == out) {
        u_fprintf(file, "\n");
      }
      printStrength(line, file);
      printLine(line, file);
    }
    previous = line;
  }
  u_fprintf(file, "\n");
}


// Numbers the elements of an (already ordered) array of line pointers via
// sortedIndex and chains neighbours together through previous/next.
// The array must contain at least one element. The first element's
// previous and the last element's next are not modified.
void setIndexes(Line **lines, int32_t size) {
  Line *prev = lines[0];
  prev->sortedIndex = 0;

  for(int32_t pos = 1; pos < size; pos++) {
    Line *line = lines[pos];
    line->previous = prev;
    prev->next = line;
    line->sortedIndex = pos;
    prev = line;
  }
}


// this seems to be a dead end
//
// Records `line` as an expansion and tries to splice it into the linked
// ordering at the position determined by its expansion string.
//
// gLines:   sorted array of line pointers used to find the insertion point.
// line:     the expansion (name + expansionString) to record.
// size:     number of entries in gLines.
// comparer: comparison function for Line* elements.
//
// NOTE(review): the only visible call to this overload (in
// detectExpansions) is commented out.
void
noteExpansion(Line **gLines, Line *line, int32_t size, CompareFn comparer) {
  UErrorCode status = U_ZERO_ERROR;

  UnicodeString key(line->name, line->len);
  //Line *toInsert = (Line *)gElements.get(key);
  Line *toInsert = (Line *)gExpansions.get(key);
  if(toInsert != NULL) {
    // Already known as an expansion candidate: copy the expansion string
    // over and unlink the element from its current place in the chain.
    toInsert->isExpansion = TRUE;
    u_strcpy(toInsert->expansionString, line->expansionString);
    toInsert->expLen = line->expLen;
    // NOTE(review): previous/next are dereferenced without a NULL check -
    // confirm the element can never be first or last in the chain here.
    toInsert->previous->next = toInsert->next;
    toInsert->next->previous = toInsert->previous;
    // NOTE(review): if gElements owns its values, this may destroy an
    // object that is still used below - verify against the table setup.
    gElements.remove(key);
  } else {
    // Not seen before: store a private copy in gElements.
    toInsert = new Line(*line); 
    toInsert->isExpansion = TRUE;
    gElements.put(UnicodeString(toInsert->name, toInsert->len), toInsert, status);
  }

  int32_t i = 0;
  Line testLine;
  Line *l = &testLine;
  // Find the first element whose text, extended by the expansion string,
  // compares greater than `line`, and link toInsert in front of it.
  for(i = 0; i < size; i++) {
    u_strcpy(testLine.name, (*(gLines+i))->name);
    u_strcat(testLine.name, line->expansionString);
    testLine.len = (*(gLines+i))->len + line->expLen;
    if(comparer(&l, &line) > 0) {
      // NOTE(review): for i == 0 this reads gLines[-1].
      toInsert->previous = *(gLines+i-1);
      toInsert->next = *(gLines+i);
      toInsert->previous->next = toInsert;
      toInsert->next->previous = toInsert;
      break;
    }
  }
  if(gVerbose) {
    u_fprintf(log, "Adding expansion\n");
    escapeALine(line, log);
    u_fprintf(log, "/");
    escapeExpansion(line, log);
    u_fprintf(log, " ");
  }
}

// Moves every detected expansion to its proper place in the linked
// ordering that starts at *gLines, and (re)computes the strengths of the
// affected neighbours.
//
// gLines:   array of line pointers; only *gLines (the head of the chain)
//           is used for walking, the array itself is passed to the
//           (disabled) debug printing.
// size:     number of elements, used for debug printing only.
// comparer: comparison function for Line* elements.
//
// The expansions are taken from gExpansions, sorted with `comparer`, and
// handled one by one. For each one the chain is walked from the head;
// every visited element that is not itself an expansion is extended with
// the candidate's expansion string before being compared to the candidate.
void
positionExpansions(Line **gLines, int32_t size, CompareFn comparer) {
  // NOTE(review): `result` and the function-level `i` are never used;
  // the `i` declared inside the loop shadows the outer one and is unused too.
  int result = 0;
  Line *line = NULL;
  Line *toMove = NULL;
  int32_t i = 0, j = 0;
  Line **sortedExpansions = new Line*[gExpansions.count()];
  int32_t sortedExpansionsSize = setArray(sortedExpansions, &gExpansions);
  qsort(sortedExpansions, sortedExpansionsSize, sizeof(Line *), comparer);
  // Make a list of things in the vicinity of expansion candidate
  for(j = 0; j < sortedExpansionsSize; j++) {
    line = *(sortedExpansions+j);
    UnicodeString key(line->name, line->len);
    // The element that lives in the chain is looked up in gElements.
    // NOTE(review): toMove is used below without a NULL check - confirm
    // that every key of gExpansions is also present in gElements.
    toMove = (Line *)gElements.get(key);
    int32_t i = 0;
    // testLine holds the current chain element with the expansion string
    // appended; prevTestLine holds the test line of the previous element.
    // NOTE(review): prevTestLine is only assigned at the end of an
    // iteration, so on the first iteration prevL refers to a
    // default-constructed Line.
    Line testLine, prevTestLine;
    Line *l = &testLine;
    Line *prevL = &prevTestLine;
    // This can be further optimized, since we now know that we have a 
    // sorted list of expansions, so current can start from toMove, since all
    // the elements before it are already smaller. In the beginning it needs to 
    // be on gLines, though.
    Line *current = *gLines;
    while(current) {
      if(current == toMove) {
        // we are wading through a sorted list
        // if we found ourselves, it means that we 
        // are already in a right place, so no moving
        // is needed, but we need to make sure we have
        // the right strength.
        toMove->strength = probeStrength(&prevL, &toMove, comparer);
        if(0) {
          u_fprintf(log, "Positioned expansion without moving ");
          printLine(toMove, log);
          u_fprintf(log, " new ordering: \n");
          printOrdering(gLines, size, log, TRUE);
        }
        break;
      } else {
        // Build the test string for `current`: plain elements get the
        // candidate's expansion string appended, expansions are compared as is.
        u_strcpy(testLine.name, current->name);
        if(!current->isExpansion) {
          u_strcat(testLine.name, line->expansionString);
          testLine.len = current->len + line->expLen;
        } else {
          testLine.len = current->len;
        }
        if(comparer(&l, &line) > 0) {
          // `current` is the first element that sorts after the candidate:
          // the candidate has to be moved directly in front of it.
          // remove from chain
          if(toMove->next) {
            // NOTE(review): toMove->previous is handed to probeStrength
            // before the NULL check on it below.
            toMove->next->strength = probeStrength(&(toMove->previous), &(toMove->next), comparer);
            toMove->next->previous = toMove->previous;
          }
          if(toMove->previous) {
            toMove->previous->next = toMove->next;
          }

          // insert
          toMove->previous = current->previous;
          toMove->next = current;

          if(current->previous) {
            current->previous->next = toMove;
          }
          current->previous = toMove;

          // Strength against the (extended) predecessor and of the
          // (extended) successor against the moved element.
          toMove->strength = probeStrength(&prevL, &toMove, comparer);
          toMove->next->strength = probeStrength(&toMove, &l, comparer);
          if(0) {
            u_fprintf(log, "Positioned expansion ");
            printLine(toMove, log);
            u_fprintf(log, " new ordering: \n");
            printOrdering(gLines, size, log, TRUE);
          }
          if(toMove->strength == UCOL_IDENTICAL) {
            // check for craziness such as s = ss/s
            // such line would consist of previous (or next) concatenated with the expansion value
            // make a test
            // NOTE(review): fixed 256-unit buffer and an unchecked
            // toMove->previous - confirm both are safe for all inputs.
            UChar fullString[256];
            u_strcpy(fullString, toMove->previous->name);
            u_strcat(fullString, toMove->expansionString);
            if(u_strcmp(fullString, toMove->name) == 0) {
              // The expansion is redundant: unlink it and mark it removed.
              toMove->previous->next = toMove->next;
              toMove->next->previous = toMove->previous;
              toMove->isRemoved = TRUE;
              u_fprintf(log, "Removed: ");
              printLine(toMove, log);
              u_fprintf(log, "\n");
            } 
          } else if(toMove->next->strength == UCOL_IDENTICAL) {
            // Same test against the successor.
            UChar fullString[256];
            u_strcpy(fullString, toMove->next->name);
            u_strcat(fullString, toMove->expansionString);
            if(u_strcmp(fullString, toMove->name) == 0) {
              // The successor inherits the strength of the removed element.
              toMove->next->strength = toMove->strength;
              toMove->previous->next = toMove->next;
              toMove->next->previous = toMove->previous;
              toMove->isRemoved = TRUE;
              u_fprintf(log, "Removed because of back: ");
              printLine(toMove, log);
              u_fprintf(log, "\n");
            } 
          }
          break;
        }
        // Remember this element's test line as the predecessor for the
        // next round.
        prevTestLine = testLine;
      }
      current = current->next;
    }
  }
  delete[] sortedExpansions;
}


// Records `line` as an expansion in the global tables.
//
// - gElements: if an element with the same text exists it is flagged as an
//   expansion and receives line's expansion string; otherwise a copy of
//   `line` (flagged as an expansion) is inserted.
// - gExpansions: an existing entry with the same text is updated the same
//   way; a missing entry is left alone.
//
// `line` may itself be the object stored in one of the tables (that is how
// detectExpansions calls this function); in that case only the flag is set,
// because copying the expansion string onto itself would be an overlapping
// copy.
void
noteExpansion(Line *line) {
  UErrorCode status = U_ZERO_ERROR;
  UnicodeString key(line->name, line->len);

  Line *el = (Line *)gElements.get(key);
  if(el != NULL) {
    el->isExpansion = TRUE;
    if(el != line) {
      u_strcpy(el->expansionString, line->expansionString);
      el->expLen = line->expLen;
    }
  } else {
    Line *toInsert = new Line(*line); 
    toInsert->isExpansion = TRUE;
    gElements.put(key, toInsert, status);
  }

  // The lookup result was previously dereferenced unconditionally.
  Line *el2 = (Line *)gExpansions.get(key);
  if(el2 != NULL) {
    el2->isExpansion = TRUE;
    if(el2 != line) {
      u_strcpy(el2->expansionString, line->expansionString);
      el2->expLen = line->expLen;
    }
  }

  if(gDebug) {
    u_fprintf(log, "Adding expansion\n");
    printLine(line, log);
    u_fprintf(log, "\n");
  }
}

// Stores a copy of `line`, flagged as a contraction, in gElements under
// the line's text. In verbose mode the line is also written to the log.
void 
noteContraction(Line *line) {
  UErrorCode status = U_ZERO_ERROR;

  Line *contraction = new Line(*line); 
  contraction->isContraction = TRUE;

  UnicodeString key(line->name, line->len);
  gElements.put(key, contraction, status);

  if(!gVerbose) {
    return;
  }
  u_fprintf(log, "Adding contraction\n");
  escapeALine(line, log);
  u_fprintf(log, " ");
}

// Stores a copy of `line` in gElements under the line's text.
void
noteElement(Line *line) {
  UErrorCode status = U_ZERO_ERROR;

  UnicodeString key(line->name, line->len);
  Line *copy = new Line(*line);
  gElements.put(key, copy, status);

  // Debug trace, deliberately disabled (was: if(gDebug)).
  if(0) {
    escapeALine(line, log);
    u_fprintf(log, " ");
  }
}


// This function checks if a combination of characters has changed place with the 
// adjacent elements. If so, these are most probably contractions.
// However, it still needs to be checked if these contractions are fake - the 
// test is simple - if xy is suspected contraction, if we get that x/y is expansion, then
// xy is a fake contraction.
//
// lines:    array of line pointers in previously established sorted order;
//           each line's strength field holds the strength recorded for that
//           order, while its text may have been changed since (see
//           detectContractions, which rewrites the names in place).
// size:     number of entries in lines.
// comparer: comparison function for Line* elements.
//
// Returns the number of adjacent pairs whose freshly probed strength is
// UCOL_OFF or differs from the recorded one. Suspected contractions are
// registered through noteContraction().
int32_t 
analyzeContractions(Line** lines, int32_t size, CompareFn comparer) {
  int32_t i = 0, j = 0;
  int32_t outOfOrder = 0;
  UColAttributeValue strength = UCOL_OFF;
  // NOTE(review): currStrength is never used.
  UColAttributeValue currStrength = UCOL_OFF;
  Line **prevLine = lines;
  Line **currLine = NULL;
  // backupLine is only assigned (inside the loop conditions), never read.
  Line **backupLine = NULL;
  UBool prevIsContraction = FALSE, currIsContraction = FALSE;
  // Problem here is detecting a contraction that is at the very end of the sorted list
  for(i = 1; i < size; i++) {
    currLine = lines+i;
    strength = probeStrength(prevLine, currLine, comparer);
    // A pair is suspicious when it is out of order or when its strength
    // no longer matches the recorded one.
    if(strength == UCOL_OFF || strength != (*currLine)->strength) {
      prevIsContraction = FALSE;
      currIsContraction = FALSE;
      if(!outOfOrder) {
        // first hit: start the log line
        if(gVerbose) {
          u_fprintf(log, "Possible contractions: ");
        }
      }
      // now we have two elements that are different. The question is, 
      // which one of them is the contraction - which one has moved. 
      // Could be the previous, but could also be the current.

      outOfOrder++;

      // First, lets check whether the previous has jumped back
      j = i+1;
      // skip all the nexts that have smaller strength, they don't have an effect
      while(j < size && (*(lines+j))->strength > (*currLine)->strength) {
        j++;
      }
      // check if there are other elements of same or greater strength
      while(j < size && 
        (strength = probeStrength(prevLine, (backupLine = lines+j), comparer)) == UCOL_OFF) {
        j++;
        // if we skipped more than one, it might be in fact a contraction
        prevIsContraction = TRUE;
      }
      if(prevIsContraction) {
        noteContraction(*prevLine);
        j = i-2;
        // add all the previous elements with smaller strength, since they also
        // will jump over and are contractions
        while(j >= 0 && (*(lines+j+1))->strength > (*currLine)->strength) {
          strength = probeStrength(lines+j, currLine, comparer);
          if(strength == UCOL_OFF) {
            noteContraction(*(lines+j));
          }
          j--;
        }
      }

      // now we check if the current element is jumping forward,
      // the dance steps are analogous to above.
      j = i - 2;
      while(j >= 0 && (*(lines+j+1))->strength > (*currLine)->strength) {
        j--;
      }
      while(j >= 0 && 
        (strength = probeStrength((backupLine = lines+j), currLine, comparer)) == UCOL_OFF) {
        j--;
        currIsContraction = TRUE;
      }
      if(currIsContraction) {
        // NOTE(review): unlike the "previous" case above, the current line
        // itself is only logged here and not passed to noteContraction() -
        // confirm whether that is intended.
        if(gVerbose) {
          escapeALine(*currLine, log);
          u_fprintf(log, " ");
        }
        j = i+1;
        while(j < size && (*(lines+j))->strength > (*currLine)->strength) {
          strength = probeStrength(prevLine, lines+j, comparer);
          if(strength == UCOL_OFF) {
            noteContraction(*(lines+j));
          }
          j++;
        }
      }

      // Not sure about either. List both and then check
      if(!(prevIsContraction || currIsContraction)) {
        noteContraction(*prevLine);
        noteContraction(*currLine);
      }
    }
    prevLine = currLine;
  }
  if(outOfOrder) {
    // terminate the log line started above
    if(gVerbose) {
      u_fprintf(log, "\n");
    }
  }
  return outOfOrder;
}

// Searches for contractions by testing all two-element concatenations.
//
// gLines:   array of pointers into `lines`, in sorted order (it is handed
//           to analyzeContractions unchanged).
// lines:    the line objects themselves; their names are temporarily
//           overwritten with the concatenations.
//           NOTE(review): on return they hold the concatenations of the
//           last round, not the original contents - confirm callers expect that.
// size:     number of elements in both arrays.
// comparer: comparison function for Line* elements.
//
// For every element p, each element s is replaced by the text p+s and
// analyzeContractions() checks whether the established order still holds.
// Returns the accumulated count reported by analyzeContractions(). Prints
// one "." per round to the log unless gQuiet is set.
int32_t
detectContractions(Line **gLines, Line *lines, int32_t size, CompareFn comparer) {
  int32_t found = 0;

  // Pristine copy of the input; every round starts from it.
  Line *originals = new Line[size]; 
  Line::copyArray(originals, lines, size); 

  // Scratch array for the debug listing, allocated on first use.
  Line **debugSorted = NULL;

  for(int32_t prefix = 0; prefix < size; prefix++) {
    // preserve index and previous
    Line::copyArray(lines, originals, size); 
    for(int32_t suffix = 0; suffix < size; suffix++) {
      Line &doubled = lines[suffix];
      u_strcpy(doubled.name, originals[prefix].name);
      u_strcat(doubled.name, originals[suffix].name);
      doubled.len = originals[prefix].len + originals[suffix].len;
    }

    found += analyzeContractions(gLines, size, comparer);
    if(found && gDebug) {
      if(debugSorted == NULL) {
        debugSorted = new Line*[size];
      }
      // Show the sorted doubles, for debugging
      setArray(debugSorted, lines, size);
      qsort(debugSorted, size, sizeof(Line *), comparer);
      analyzeStrength(debugSorted, size, comparer);
      printOrdering(debugSorted, size, log);
    }
    if(!gQuiet) {
      u_fprintf(log, ".");
    }
  }
  if(!gQuiet) {
    u_fprintf(log, "\n");
  }

  delete[] originals; 
  delete[] debugSorted;   // deleting a null pointer is a no-op
  return found;
}

// gLines in this function is an array of sorted pointers.
// Contractions are already included. 
//
// Detects expansions and works out what each of them expands to.
//
// gLines:   sorted array of line pointers.
// size:     number of entries in gLines.
// comparer: comparison function for Line* elements.
//
// Phase 1 - for every prefix element i, the doubled strings
//   gLines[i]+gLines[0] and gLines[i]+gLines[size-1] bracket everything
//   that starts with that prefix. An element k that sorts strictly inside
//   that bracket is a candidate; the doubles gLines[i]+gLines[j] are then
//   scanned to find where k falls, the hit is recorded on the element
//   (addExpansionHit) and the element is cached in gExpansions.
// Phase 2 - for every cached candidate the neighbouring doubles are
//   sorted together with the candidate, and the expansion string is taken
//   from the neighbour that compares identical, or else from the smaller
//   of the two neighbours' suffixes. The result is registered through
//   noteExpansion(Line*).
//
// Returns the number of entries in gExpansions.
int32_t
detectExpansions(Line **gLines, int32_t size, CompareFn comparer) {
  UErrorCode status = U_ZERO_ERROR;
  // detect expansions

  UColAttributeValue startStrength = UCOL_OFF, endStrength = UCOL_OFF, 
    strength = UCOL_OFF, previousStrength = UCOL_OFF;
  Line start, end, src;
  Line *startP = &start, *endP = &end, *srcP = &src;
  Line *current = NULL;
  // NOTE(review): memset over Line objects assumes Line can safely be
  // zero-filled byte-wise - confirm against the class definition.
  memset(startP, 0, sizeof(Line));
  memset(endP, 0, sizeof(Line));
  memset(srcP, 0, sizeof(Line));
  int32_t srcLen;
  int32_t i = 0, j = 0, k = 0;
  for(i = 0; i < size; i++) {
    // Build the two bracket strings for prefix i:
    //   start = gLines[i] + gLines[0], end = gLines[i] + gLines[size-1]
    u_strcpy(start.name, (*(gLines+i))->name);
    u_strcpy(end.name, (*(gLines+i))->name);
    srcLen = (*(gLines+i))->len;
    u_strcpy(start.name+srcLen, (*(gLines))->name);
    start.len = srcLen + (*(gLines))->len;
    u_strcpy(end.name+srcLen, (*(gLines+size-1))->name);
    end.len = srcLen + (*(gLines+size-1))->len;

    for(k = 0; k < size; k++) { // k is index of a thing that is not doubled
      current = *(gLines+k);
      // see if we have moved to front
      // has it moved to the very beginning
      if((startStrength = probeStrength((gLines+k), &startP, comparer)) != UCOL_OFF) {
        continue; // this one is in the front
      }
      // has it moved to the very end?
      if((endStrength = probeStrength(&endP, (gLines+k), comparer)) != UCOL_OFF) {
        continue; // this one is in the back
      }
      // Potential Expansion     
      if(gDebug) { //gVerbose
        u_fprintf(log, "Possible expansion: ");
        escapeALine(*(gLines+k), log);
        u_fprintf(log, " ");
      }
      // Now we have to make sure that this is really an expansion
      // First, we have to find it
      u_strcpy(src.name, (*(gLines+i))->name);
      for(j = 0; j < size; j++) {
        // src = gLines[i] + gLines[j]
        u_strcpy(src.name+srcLen, (*(gLines+j))->name);
        src.len = srcLen + (*(gLines+j))->len;
        if((strength = probeStrength(&srcP, (gLines+k), comparer)) == UCOL_OFF) {
          // src now sorts after element k: re-probe in the proper order to
          // get the strength between k and src.
          strength = probeStrength((gLines+k), &srcP, comparer);
          // we found it *(gLines+j-1) is the element that is interesting
          // since gLines+j-1 < gLines+k < gLines+j
          if(gDebug) { //gVerbose
            u_fprintf(log, "i = %i, k = %i, j = %i ", i, k, j);
            escapeALine(*(gLines+i), log);
            // NOTE(review): reads gLines[-1] if j is 0 here - presumably
            // the start-bracket test above rules that out; confirm.
            escapeALine(*(gLines+j-1), log);
            printStrength(previousStrength, log);
            escapeALine(current, log);
            printStrength(strength, log);
            escapeALine(*(gLines+i), log);
            escapeALine(*(gLines+j), log);
            u_fprintf(log, "\n");
          }
          // check whether it is a contraction that is the same as an expansion
          // or a multi character that doesn't do anything
          current->addExpansionHit(i, j);
          current->isExpansion = TRUE;
          current->expIndex = k;
          // cache expansion
          gExpansions.put(UnicodeString(current->name, current->len), current, status); //new Line(*current)
          break;
        }
        previousStrength = strength;
      }
    }
    if(!gQuiet) {
      u_fprintf(log, ".");
    }
  }  
  if(!gQuiet) {
    u_fprintf(log, "\n");
  }
  // now we have identified possible expansions. We need to find out how do they expand. 
  // Let's iterate over expansions cache - it's easier.
  const UHashElement *el = NULL;
  int32_t hashIndex = -1;
  // Work areas for the doubles around one candidate.
  // NOTE(review): each candidate needs 2*expansionPrefixesSize+1 slots;
  // the size*10 capacity is not checked against that.
  Line *doubles = new Line[size*10]; 
  Line **sorter = new Line*[size*10];
  // NOTE(review): currSize is computed below but never used.
  int32_t currSize = 0;
  int32_t newSize = 0;
  Line *prev = NULL;
  Line *next = NULL;
  Line *origin = NULL;
  int result = 0;
  // Make a list of things in the vicinity of expansion candidate
  // in expansionPrefixes and expansionAfter we have stored the
  // prefixes of stuff that caused the detection of an expansion
  // and a position where the expansion was. 
  // For example (icu, de__PHONEBOOK), we had:
  // aE <<< \u00E4 < af
  // AD < \u00E4 <<< Ae
  // From that we will construct the following sequence:
  // AD < aE <<< \u00E4/ <<< Ae < af
  // then we will take the vicinity of \u00E4:
  // aE <<< \u00E4/ <<< Ae
  // then we will choose the smallest expansion to be the expansion
  // part: 'e'.
  // if there is equality, we choose the equal part:
  // (win32, de__PHONEBOOK):
  // AD < \u00E4/ = ae <<< aE <<< Ae
  // we choose 'e'.

  while((el = gExpansions.nextElement(hashIndex)) != NULL) {
    newSize = 0;
    current = (Line *)el->value.pointer;
    currSize = size*current->expansionPrefixesSize;
    if(gDebug) {
      escapeALine(current, log);
      u_fprintf(log, " Number: %i\n", current->expansionPrefixesSize);
    }
    // construct the doubles 
    // For every recorded hit, add prefix+element-before and
    // prefix+element-at the hit position; `suffix` remembers the index of
    // the second half in gLines.
    // NOTE(review): expansionAfter[i]-1 is -1 if a hit was recorded with j == 0.
    for(i = 0; i < current->expansionPrefixesSize; i++) {
      doubles[newSize].suffix = current->expansionAfter[i]-1;
      doubles[newSize++].setToConcat(*(gLines+current->expansionPrefixes[i]), *(gLines+current->expansionAfter[i]-1));
      doubles[newSize].suffix = current->expansionAfter[i];
      doubles[newSize++].setToConcat(*(gLines+current->expansionPrefixes[i]), *(gLines+current->expansionAfter[i]));
    }
    // add the expansion we're observing
    doubles[newSize++] = *current;
    setArray(sorter, doubles, newSize);
    qsort(sorter, newSize, sizeof(Line*), comparer);
    analyzeStrength(sorter, newSize, comparer);
    if(gDebug) {
      printOrdering(sorter, newSize, log);
    }
    // Locate the candidate's copy in the sorted doubles.
    // NOTE(review): the search has no upper bound; it relies on the copy
    // added above comparing equal to *current.
    i = 0;
    while(**(sorter+i) != *current) {
      i++;
    }
    // find the two additions
    if((*(sorter+i))->strength == UCOL_IDENTICAL) {
      // the candidate is identical to its predecessor: use that
      // predecessor's suffix as the expansion
      // NOTE(review): reads sorter[-1] if i is 0 - confirm that the first
      // sorted element can never carry UCOL_IDENTICAL.
      origin = *(gLines+((*(sorter+i-1))->suffix));
      u_strcpy(current->expansionString, origin->name);
      current->expLen = origin->len;
    } else if(i < newSize-1 && (*(sorter+i+1))->strength == UCOL_IDENTICAL) {
      // the successor is identical to the candidate: use its suffix
      origin = *(gLines+((*(sorter+i+1))->suffix));
      u_strcpy(current->expansionString, origin->name);
      current->expLen = origin->len;
    } else {
      // No identical neighbour: when the candidate has neighbours on both
      // sides, take the smaller of the two suffixes; otherwise the
      // expansion string is left as it was.
      if(i > 0) {
        prev = *(gLines+(*(sorter+i-1))->suffix);
        if(i < newSize-1) {
          next = *(gLines+(*(sorter+i+1))->suffix);
          result = comparer(&prev, &next);
          if(result <= 0) {
            u_strcpy(current->expansionString, prev->name);
            current->expLen = prev->len;
          } else {
            u_strcpy(current->expansionString, next->name);
            current->expLen = next->len;
          }
        }      
      }
      if(0) { //if(gDebug)
        u_fprintf(log, "Expansion is: ");
        escapeALine(current, log);
        u_fprintf(log, "/");
        escapeExpansion(current, log);
        u_fprintf(log, "\n");
      }
    }
    noteExpansion(current);
    //noteExpansion(gLines, current, size, comparer);
    if(!gQuiet) {
      u_fprintf(log, ".");
    }
  }
  if(!gQuiet) {
    u_fprintf(log, "\n");
  }
  delete[] doubles;
  delete[] sorter;
  return gExpansions.count();
}

/**
 * Compares the collation elements that the collator under test (gCol) and
 * the reference collator (gUCA) produce for the text of a line.
 * Elements whose value is 0 are skipped on both sides before comparing.
 *
 * @param line   line whose name (line->len code units) is examined
 * @param status ICU error code handed to every ICU call made here
 * @return TRUE as soon as the two element streams differ, FALSE when both
 *         streams end (UCOL_NULLORDER) without any difference
 */
UBool
isTailored(Line *line, UErrorCode &status) {
  UCollationElements *tailoredIter = ucol_openElements(gCol, line->name, line->len, &status);
  UCollationElements *ucaIter = ucol_openElements(gUCA, line->name, line->len, &status);
  UBool differs = FALSE;

  for(;;) {
    int32_t tailoredCE = UCOL_NULLORDER;
    int32_t ucaCE = UCOL_NULLORDER;

    // fetch the next non-zero element from each iterator
    do {
      tailoredCE = ucol_next(tailoredIter, &status);
    } while(tailoredCE == 0);
    do {
      ucaCE = ucol_next(ucaIter, &status);
    } while(ucaCE == 0);

    if(tailoredCE != ucaCE) {
      differs = TRUE;
      break;
    }
    // the elements are equal here; stop once either stream is exhausted
    if(tailoredCE == UCOL_NULLORDER || ucaCE == UCOL_NULLORDER) {
      break;
    }
  }

  ucol_closeElements(tailoredIter);
  ucol_closeElements(ucaIter);
  return differs;
}

/**
 * Walks the linked list of lines starting at *gLines. Lines for which
 * isTailored() reports no difference are flagged isRemoved. For a tailored
 * line whose predecessor was removed, the walk goes back over removed
 * predecessors (while they are weaker in strength, expansions or
 * contractions) and flags the line it stops on with isReset.
 *
 * @param gLines array whose first element is the head of the linked list
 * @param size   not used by this function
 */
void
reduceUntailored(Line **gLines, int32_t size){
  UErrorCode status = U_ZERO_ERROR;

  for(Line *line = *(gLines); line; line = line->next) {
    if(!isTailored(line, status)) {
      // same elements as the reference collator: drop the line
      line->isRemoved = TRUE;
      continue;
    }

    Line *anchor = line->previous;
    if(anchor == NULL || anchor->isRemoved != TRUE) {
      // tailored line directly following a kept line: nothing to mark
      continue;
    }

    // step back over removed predecessors that qualify
    while(anchor
          && (anchor->strength > line->strength || anchor->isExpansion || anchor->isContraction)
          && anchor->isRemoved) {
      Line *earlier = anchor->previous;
      if(earlier == NULL || !earlier->isRemoved) {
        break;
      }
      anchor = earlier;
    }

    // fall back to the head of the list when there is no anchor
    Line *resetPoint = anchor ? anchor : *(gLines);
    resetPoint->isReset = TRUE;
  }
}

void
constructAndAnalyze(Line **gLines, Line *lines, int32_t size, CompareFn comparer) {
  int32_t i = 0, j = 0, k = 0;
  // setup our compare arrays to point to single set.

  // For contractions we need a block of data
  setArray(gLines, lines, size);
  //size = setArray(gLines);

  qsort(gLines, size, sizeof(Line *), comparer);

  // Establish who is previous according to the sort order
  //setIndexes(gLines, size);

  analyzeStrength(gLines, size, comparer);
  if(gVerbose) {
    u_fprintf(log, "Ordering:\n");
    printOrdering(gLines, size, log);
  }

  //showDifferences(exemplarSetSize);
  //dumpData(exemplarSetSize);

  if(!gQuiet) {
    u_fprintf(log, "Detecting contractions?\n");
  }
  int32_t noContractions = 0;
  noContractions = detectContractions(gLines, lines, size, comparer);
  if(!gQuiet) {
    u_fprintf(log, "Detected %i contractions\n", noContractions);
  }

  // now we have suspected contractions in the table
  // we have to re-sort the things 
  size = setArray(gLines);
  qsort(gLines, size, sizeof(Line *), comparer);
  analyzeStrength(gLines, size, comparer);

  if(!gQuiet) {
    u_fprintf(log, "Detecting expansions\n");
  }
  int32_t noExpansions = detectExpansions(gLines, size, comparer);
  if(!gQuiet) {
    u_fprintf(log, "Detected %i expansions\n", noExpansions);
  }

  positionExpansions(gLines, size, comparer);

  if(gVerbose) {
    u_fprintf(log, "After positioning expansions:\n");
    printOrdering(gLines, size, log, TRUE);
  }
  //reduceUntailored(gLines, size);
  if(!gQuiet) {
    u_fprintf(out, "Final result\n");
  }
  printOrdering(gLines, size, out, TRUE);
  printOrdering(gLines, size, log, TRUE);
}

// Check whether upper case comes before lower case or vice-versa
int32_t 
checkCaseOrdering(void) {
  UChar stuff[][3] = {
    { 0x0061, separatorChar, 0x0061}, //"aa",
    { 0x0061, separatorChar, 0x0041 }, //"a\\u00E0",
    { 0x0041, separatorChar, 0x0061 }, //"\\u00E0a",
    { 0x0041, separatorChar, 0x0041 }, //"\\u00E0a",
    //{ 0x00E0, separatorChar, 0x00E0 }  //"\\u00E0\\u00E0"
  };
  const int32_t size = sizeof(stuff)/sizeof(stuff[0]);

  Line **sortedLines = new Line*[size];
  Line lines[size];

  int32_t i = 0;
  int32_t ordered = 0, reversed = 0;

  for(i = 0; i < size; i++) {
    lines[i].setName(stuff[i], 3);
  }
  setArray(sortedLines, lines, size);
  qsort(sortedLines, size, sizeof(Line*), gComparer);

  for(i = 0; i < size; i++) {
    if(*(sortedLines+i) == &lines[i]) {
      ordered++;
    }
    if(*(sortedLines+i) == &lines[size-i-1]) {
      reversed++;
    }
  }

  delete[] sortedLines;
  if(ordered == size) {
    return 0; // in normal order
  } else if(reversed == size) {
    return 1; // in reversed order
  } else {
    return -1; // unknown order
  }
}


// Check whether the secondaries are in the straight or reversed order
int32_t 
checkSecondaryOrdering(void) {
  UChar stuff[][5] = {
    { 0x0061, separatorChar, 0x0061, separatorChar, 0x00E0 }, //"aa",
    { 0x0061, separatorChar, 0x00E0, separatorChar, 0x0061 }, //"a\\u00E0",
    { 0x00E0, separatorChar, 0x0061, separatorChar, 0x0061 }, //"\\u00E0a",
    //{ 0x00E0, separatorChar, 0x00E0 }  //"\\u00E0\\u00E0"
  };
  const int32_t size = sizeof(stuff)/sizeof(stuff[0]);

  Line **sortedLines = new Line*[size];
  Line lines[size];

  int32_t i = 0;
  int32_t ordered = 0, reversed = 0;

  for(i = 0; i < size; i++) {
    lines[i].setName(stuff[i], 5);
  }
  setArray(sortedLines, lines, size);
  qsort(sortedLines, size, sizeof(Line*), gComparer);

  for(i = 0; i < size; i++) {
    if(*(sortedLines+i) == &lines[i]) {
      ordered++;
    }
    if(*(sortedLines+i) == &lines[size-i-1]) {
      reversed++;
    }
  }

  delete[] sortedLines;
  if(ordered == size) {
    return 0; // in normal order
  } else if(reversed == size) {
    return 1; // in reversed order
  } else {
    return -1; // unknown order
  }
}

// We have to remove ignorable characters from the exemplar set,
// otherwise, we get messed up results.
//
// Every element of the set (string or single code point) is copied into a
// scratch Line and probed against an empty Line with probeStrength():
//   - a result of UCOL_IDENTICAL puts the element into `ignorables`,
//   - any other result greater than UCOL_PRIMARY puts it into
//     `primaryIgnorables`.
// Both groups are removed from exemplarUSet and their patterns are written
// to `log`.
//
// exemplarUSet - set to filter; modified in place
// comparer     - comparison function handed to probeStrength()
// status       - not used by this function
void removeIgnorableChars(UnicodeSet &exemplarUSet, CompareFn comparer, UErrorCode &status) {
  UnicodeSet ignorables, primaryIgnorables;
  UnicodeSetIterator exemplarUSetIter(exemplarUSet);
  exemplarUSetIter.reset();
  Line empty;
  Line *emptyP = &empty;
  Line current;
  Line *currLine = &current;
  UColAttributeValue strength = UCOL_OFF;


  while(exemplarUSetIter.next()) {
    if(exemplarUSetIter.isString()) { // process a string
      // NOTE(review): the string length is not checked against the size of
      // Line::name (the code point branch assumes a capacity of 25) - confirm
      // that exemplar strings can never be longer than the buffer.
      u_memcpy(currLine->name, exemplarUSetIter.getString().getBuffer(), exemplarUSetIter.getString().length());
      currLine->len = exemplarUSetIter.getString().length();
      strength = probeStrength(&emptyP, &currLine, comparer);
      if(strength == UCOL_IDENTICAL) {
        ignorables.add(exemplarUSetIter.getString());
      } else if(strength > UCOL_PRIMARY) {
        primaryIgnorables.add(exemplarUSetIter.getString());
      }
    } else { // process code point
      UBool isError = FALSE;
      UChar32 codePoint = exemplarUSetIter.getCodepoint();
      currLine->len = 0;
      U16_APPEND(currLine->name, currLine->len, 25, codePoint, isError);
      strength = probeStrength(&emptyP, &currLine, comparer);
      if(strength == UCOL_IDENTICAL) {
        ignorables.add(codePoint);
      } else if(strength > UCOL_PRIMARY) {
        primaryIgnorables.add(codePoint);
      }
    }
  }


  exemplarUSet.removeAll(ignorables);
  exemplarUSet.removeAll(primaryIgnorables);

  // The patterns used to be "terminated" with setCharAt(length(), 0).
  // setCharAt() pins an out-of-range offset to the last valid index, so that
  // call replaced the final character of the pattern with NUL instead of
  // appending a terminator. getTerminatedBuffer() provides a NUL-terminated
  // buffer without touching the contents.
  UnicodeString removedPattern;
  if(ignorables.size()) {
    u_fprintf(log, "Ignorables:\n");
    ignorables.toPattern(removedPattern, TRUE);
    escapeString(removedPattern.getTerminatedBuffer(), removedPattern.length(), log);
    u_fprintf(log, "\n");
  }
  if(primaryIgnorables.size()) {
    u_fprintf(log, "Primary ignorables:\n");
    primaryIgnorables.toPattern(removedPattern, TRUE);
    escapeString(removedPattern.getTerminatedBuffer(), removedPattern.length(), log);
    u_fprintf(log, "\n");
  }

}

// TODO: develop logic for choosing boundary characters - right now it is hardcoded.
// It should be a function of the used scripts. Also, check whether we need to save
// used script names.
//
// Adds a fixed list of helper characters to the starting set.
//   exemplarUSet - set that receives the characters
//   status       - not used by this function
void addUtilityChars(UnicodeSet &exemplarUSet, UErrorCode &status) {
  static const UChar32 utilityChars[] = {
    // To get nice rules we need some characters that are parts of
    // compatibility composed characters, such as L-middle dot. Without them
    // a reset could land on a funky character like L-middle dot.
    // This part of the list will probably grow.
    0x00B7, // middle dot

    // Characters standing in for a script before the target script and a
    // script after it. More logic should be added so that these are chosen
    // automatically (U+0433 and U+0436 are currently disabled).
    0x0038,
    0x0039,
    0xfa29,
    0xfa28
  };
  const int32_t utilityCount = (int32_t)(sizeof(utilityChars)/sizeof(utilityChars[0]));

  int32_t k = 0;
  for(k = 0; k < utilityCount; k++) {
    exemplarUSet.add(utilityChars[k]);
  }
}

void
getExemplars(const char *locale, UnicodeSet &exemplars, UErrorCode &status) {
  // first we fill out structures with exemplar characters.
  UResourceBundle *res = ures_open(NULL, locale, &status);
  int32_t exemplarLength = 0;
  UnicodeString exemplarString = ures_getUnicodeStringByKey(res, "ExemplarCharacters", &status);
  exemplars.clear();
  exemplars.applyPattern(exemplarString, status);
  ures_close(res);
}

/**
 * Turns the exemplar set into the starting set for probing.
 *
 * First the scripts occurring in the incoming set are collected: the script
 * of the first code point is looked up, its short name is remembered and all
 * code points of that script are removed, until no code points are left.
 * The set is then rebuilt either from the original exemplar pattern with
 * case closure (gExemplar) or from the complete remembered scripts.
 * Finally ignorable characters are removed and utility characters added.
 *
 * @param exemplarUSet in: exemplar characters; out: starting set
 * @param comparer     comparison function passed to removeIgnorableChars()
 * @param status       ICU error code, passed to the ICU calls made here
 */
void
prepareStartingSet(UnicodeSet &exemplarUSet, CompareFn comparer, UErrorCode &status) {
  int32_t i = 0;
  UnicodeString exemplarString;
  exemplarUSet.toPattern(exemplarString);
  // Produce case closure of exemplar characters
  // Then we want to figure out what is the script of the exemplar characters
  // just pick several and see their script
  const char* usedScriptNames[USCRIPT_CODE_LIMIT];
  int32_t numberOfUsedScripts = 0;
  char scriptSetPattern[256];
  UChar32 exChar = -1;
  // The capacity test keeps the scan from ever writing past usedScriptNames.
  while(numberOfUsedScripts < USCRIPT_CODE_LIMIT
        && exemplarUSet.size() != 0 && (exChar = exemplarUSet.charAt(0)) != -1) {
    int32_t scriptNo = u_getIntPropertyValue(exChar, UCHAR_SCRIPT);
    const char *scriptName = u_getPropertyValueName(UCHAR_SCRIPT, scriptNo, U_SHORT_PROPERTY_NAME);
    if(scriptName == NULL) {
      // no name to build a pattern from: drop just this code point
      exemplarUSet.remove(exChar);
      continue;
    }
    usedScriptNames[numberOfUsedScripts++] = scriptName;
    sprintf(scriptSetPattern, "[:%s:]", scriptName);
    UnicodeSet scriptSet(UnicodeString(scriptSetPattern, ""), status);
    if(!scriptSet.contains(exChar)) {
      // The script set could not be built (for instance because status
      // already holds a failure). Removing it would not shrink the exemplar
      // set, so the scan would repeat forever; stop scanning instead.
      break;
    }
    exemplarUSet.removeAll(scriptSet);
  }
  exemplarUSet.clear();

  // always add ASCII
  // NOTE(review): the range below ends at U+005B ('['), one past 'Z' -
  // confirm whether that is intended.
  exemplarUSet.addAll(UnicodeSet(UnicodeString("[\\u0041-\\u005b]", ""), status));
  if(gExemplar) {
    // NOTE(review): applyPattern() replaces the contents of the set, so the
    // range added above does not survive in exemplar mode - confirm whether
    // that is intended.
    exemplarUSet.applyPattern(exemplarString, status);
    exemplarUSet.closeOver(USET_CASE);
    if(!gQuiet) {
      u_fprintf(out, "ICU exemplar characters:\n");
      escapeString(exemplarString.getBuffer(), exemplarString.length(), out);
      u_fprintf(out, "\n");
    }
  } else {
    if(!gQuiet) {
      u_fprintf(out, "Using scripts:\n");
    }
    // add interesting scripts
    for(i = 0; i < numberOfUsedScripts; i++) {
      sprintf(scriptSetPattern, "[:%s:]", usedScriptNames[i]);
      exemplarUSet.addAll(UnicodeSet(UnicodeString(scriptSetPattern, ""), status));
      if(!gQuiet) {
        u_fprintf(out, "%s\n", scriptSetPattern);
      }
    }
  }


  removeIgnorableChars(exemplarUSet, comparer, status);

  addUtilityChars(exemplarUSet, status);
}

void 
setOutputFile(const char *name, UErrorCode &status) {
  int32_t i = 0;
  char filename[256];
  strcpy(filename, name);
  for(i = 0; i < gPlatformNo; i++) {
    strcat(filename, "_");
    strcat(filename, platforms[gPlatformIndexes[i]].name);
  }
  if(gExemplar) {
    strcat(filename, "_exemplar");
  } else {
    strcat(filename, "_script");
  }
  strcat(filename, ".utf16.txt");
  out = u_fopen(filename, "wb", "en", "utf-16");
}

/**
 * Probes every selected platform with the given collator.
 *
 * The collator becomes the global gCol. Its tailoring rules are printed,
 * the starting character set is taken from the locale's exemplar characters
 * (or from the collator's tailored set when the collator has no locale),
 * and for each platform the set is prepared, the secondary and case order
 * are checked, the lines are built and constructAndAnalyze() is run.
 * On return the `out` stream has been flushed and closed and the collator
 * has been closed.
 *
 * @param col    collator to examine; closed by this function
 * @param status ICU error code, passed to the ICU calls made here
 */
void
processCollator(UCollator *col, UErrorCode &status) {
  int32_t i = 0;
  gCol = col;
  UChar ruleString[16384];
  const int32_t ruleCapacity = (int32_t)(sizeof(ruleString)/sizeof(ruleString[0]));
  int32_t ruleStringLength = ucol_getRulesEx(gCol, UCOL_TAILORING_ONLY, ruleString, ruleCapacity);
  if(ruleStringLength > ruleCapacity) {
    // the returned length describes the complete rules and may exceed what
    // was actually stored; never read past the end of the buffer
    ruleStringLength = ruleCapacity;
  }
  if(!gQuiet) {
    u_fprintf(out, "ICU rules:\n");
    printRules(ruleString, ruleStringLength, out);
    printRules(ruleString, ruleStringLength, log);
    u_fprintf(out, "\n");
  }
  const char *locale = ucol_getLocale(gCol, ULOC_REQUESTED_LOCALE, &status);
  UnicodeSet exemplarUSet;
  if(locale) {
    getExemplars(locale, exemplarUSet, status);
  } else {
    // copy the tailored set, then release the one the collator handed out
    USet *tailored = ucol_getTailoredSet(gCol, &status);
    if(tailored != NULL) {
      exemplarUSet = *((UnicodeSet *)tailored);
      uset_close(tailored);
    }
  }


  for(i = 0; i < gPlatformNo; i++) {
    u_fprintf(out, "\nGenerating order for platform: %s\n", platforms[gPlatformIndexes[i]].name);
    gComparer = platforms[gPlatformIndexes[i]].comparer;

    prepareStartingSet(exemplarUSet, gComparer, status);
    // get the number of all the items from the set (both codepoints and strings)
    int32_t exemplarSetSize = exemplarUSet.size();
    UnicodeSetIterator exemplarUSetIter(exemplarUSet);

    // allocate ICU lines
    gICULines = new Line*[exemplarSetSize*5]; 
    int32_t linesCount = 0;
    Line *lines = new Line[exemplarSetSize]; 

    int32_t reversedSecondary = checkSecondaryOrdering();
    if(reversedSecondary == 0) {
      u_fprintf(out, "Secondaries do not seem to be reversed\n");
    } else if(reversedSecondary == 1) {
      u_fprintf(out, "Secondaries are reversed\n");
      if(gComparer == ICUstrcmp) {
        ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status);
      }
    } else {
      u_fprintf(out, "Cannot conclude if secondaries are reversed\n");
    }

    int32_t reversedCase = checkCaseOrdering();
    if(reversedCase == 0) {
      u_fprintf(out, "Case does not seem to be reversed\n");
    } else if(reversedCase == 1) {
      u_fprintf(out, "Case is reversed\n");
      if(gComparer == ICUstrcmp) {
        ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_OFF, &status);
      }
    } else {
      u_fprintf(out, "Cannot conclude if case is reversed\n");
    }
      
    exemplarUSetIter.reset();
    gElements.removeAll();
    gExpansions.removeAll();
    linesCount = 0;

    while(exemplarUSetIter.next()) {
      Line *currLine = lines+linesCount;
      if(exemplarUSetIter.isString()) { // process a string
        // NOTE(review): the string length is not checked against the size of
        // Line::name (the code point branch assumes a capacity of 25) -
        // confirm that set strings can never be longer than the buffer.
        u_memcpy(currLine->name, exemplarUSetIter.getString().getBuffer(), exemplarUSetIter.getString().length());
        currLine->len = exemplarUSetIter.getString().length();
      } else { // process code point
        UBool isError = FALSE;
        currLine->len = 0;
        U16_APPEND(currLine->name, currLine->len, 25, exemplarUSetIter.getCodepoint(), isError);
      }
      currLine->name[currLine->len] = 0; // zero terminate, for our evil ways
      currLine->index = linesCount;
      linesCount++;
      noteElement(currLine);
    }
    constructAndAnalyze(gICULines, lines, exemplarSetSize, gComparer);

    delete[] lines; 
    // The pointer array is allocated anew for every platform, so it has to
    // be released per platform as well; releasing it once after the loop
    // leaked all but the last allocation.
    delete[] gICULines;
    gICULines = NULL;
  }


  // cleanup globals
  u_fflush(out);
  u_fclose(out);
  ucol_close(gCol);
}

void
processLocale(const char *locale, UErrorCode &status) {
  gWinLCID = uloc_getLCID(locale);

  UCollator *col = ucol_open(locale, &status);

  setOutputFile(locale, status);

  u_fprintf(out, "Locale %s (LCID:%06X)\n", locale, gWinLCID);

  processCollator(col, status);
}

/**
 * Tells whether the resource bundle of a locale has a "CollationElements"
 * entry that is found with a status of exactly U_ZERO_ERROR (any warning
 * or error status counts as "no").
 *
 * @param locName locale name
 * @return TRUE when the entry is found with U_ZERO_ERROR, FALSE otherwise
 *         (including when the bundle itself cannot be opened)
 */
UBool 
hasCollationElements(const char *locName) {
  UErrorCode status = U_ZERO_ERROR;
  UResourceBundle *bundle = ures_open(NULL, locName, &status);
  if(U_FAILURE(status)) {
    return FALSE;
  }

  // opening may have left a warning behind; the lookup starts clean
  status = U_ZERO_ERROR;
  UResourceBundle *collationElements = ures_getByKey(bundle, "CollationElements", NULL, &status);
  /* do the test - there are real elements */
  const UBool found = (UBool)(status == U_ZERO_ERROR);

  ures_close(collationElements);
  ures_close(bundle);
  return found;
}

int
main(int argc,
     char* argv[])
{
  UErrorCode status = U_ZERO_ERROR;
  err = u_finit(stderr, "en", "latin-1");
  log = u_finit(stdout, "en", "latin-1");

/*
  USet *wsp = uprv_openRuleWhiteSpaceSet(&status);
  uset_add(wsp, 0x0041);
  uset_remove(wsp, 0x0041);
  UnicodeString pat;
  ((UnicodeSet *)wsp)->toPattern(pat, TRUE);
  pat.setCharAt(pat.length(), 0);
  escapeString(pat.getBuffer(), pat.length(), log);
  u_fflush(log);
*/

  UTransliterator *anyHex = utrans_open("[^\\u000a\\u0020-\\u007f] Any-Hex/Java", UTRANS_FORWARD, NULL, 0, NULL, &status);
  u_fsettransliterator(log, U_WRITE, anyHex, &status);

  processArgs(argc, argv, status);
  int32_t i = 0;


  gElements.setValueDeleter(deleteLineElement);


  if(U_FAILURE(status) || gPlatformNo == 0) {
    return -1;
  }

  gUCA = ucol_open("root", &status);

  if(gRulesStdin) {
    char buffer[1024];
    UChar ruleBuffer[16384];
    UChar *rules = ruleBuffer;
    int32_t maxRuleLen = 16384;
    int32_t rLen = 0;
    while(gets(buffer)) {
      if(buffer[0] != '/' && buffer[1] != '/') {
        rLen = u_unescape(buffer, rules, maxRuleLen);
        rules += rLen;
        maxRuleLen -= rLen;
      }
    }
    UParseError parseError;
    //escapeString(ruleBuffer, rules-ruleBuffer, log);//
    u_fprintf(log, "%U\n", ruleBuffer);

    UCollator *col = ucol_openRules(ruleBuffer, rules-ruleBuffer, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &status);
    if(U_SUCCESS(status)) {
      setOutputFile("stdinRules", status);
      processCollator(col, status);
    } else {
      u_fprintf(err, "Error %s\n", u_errorName(status));
    }
  } else {

    if(gLocale) {
      processLocale(gLocale, status);
    } else if(gLocaleNo) {
      for(i = 0; i < gLocaleNo; i++) {
        processLocale(gLocales[i], status);
      }
    } else { // do the loop through all the locales
      int32_t noOfLoc = uloc_countAvailable();
      const char *locName = NULL;
      for(i = 0; i<noOfLoc; i++) {
        status = U_ZERO_ERROR;
        locName = uloc_getAvailable(i);
        if(hasCollationElements(locName)) {
          processLocale(locName, status);
        }
      }
    }
  }


  ucol_close(gUCA);

  u_fflush(log);
  u_fclose(log);
  u_fflush(err);
  u_fclose(err);

  return 0;
}