From 34e9e8fc9fce3d0a1c06a81eba4a56e0dfa0df7f Mon Sep 17 00:00:00 2001
From: Markus Scherer <markus.icu@gmail.com>
Date: Tue, 18 Apr 2000 16:56:02 +0000
Subject: [PATCH] ICU-130 32-bit exception values, add mirror mappings,
 overhaul...

X-SVN-Rev: 1141
---
 icu4c/source/tools/genprops/genprops.c | 698 ++++++++++++++-----------
 icu4c/source/tools/genprops/genprops.h |   7 +-
 icu4c/source/tools/genprops/store.c    | 244 ++++++---
 3 files changed, 565 insertions(+), 384 deletions(-)

diff --git a/icu4c/source/tools/genprops/genprops.c b/icu4c/source/tools/genprops/genprops.c
index a2932171b3..41e262e1d6 100644
--- a/icu4c/source/tools/genprops/genprops.c
+++ b/icu4c/source/tools/genprops/genprops.c
@@ -24,18 +24,190 @@
 #include <stdlib.h>
 #include "unicode/utypes.h"
 #include "unicode/uchar.h"
+#include "unicode/putil.h"
 #include "cmemory.h"
 #include "cstring.h"
-#include "filestrm.h"
 #include "unicode/udata.h"
 #include "unewdata.h"
+#include "uoptions.h"
+#include "uparse.h"
 #include "genprops.h"
-#include "unicode/putil.h"
 
 extern bool_t beVerbose=FALSE, haveCopyright=TRUE;
 
-/* general categories */
+/* prototypes --------------------------------------------------------------- */
 
+static void
+init(void);
+
+static void
+parseMirror(const char *filename, UErrorCode *pErrorCode);
+
+static void
+parseDB(const char *filename, UErrorCode *pErrorCode);
+
+/* -------------------------------------------------------------------------- */
+
+static UOption options[]={
+    UOPTION_HELP_H,             /* [0] main() prints the usage text if this occurs */
+    UOPTION_HELP_QUESTION_MARK, /* [1] treated like [0] by main() */
+    UOPTION_VERBOSE,            /* [2] main() copies doesOccur into beVerbose */
+    UOPTION_COPYRIGHT,          /* [3] main() copies doesOccur into haveCopyright */
+    UOPTION_DESTDIR,            /* [4] destination directory; main() presets it to u_getDataDirectory() */
+    UOPTION_SOURCEDIR,          /* [5] source directory; main() presets it to "" */
+    { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 } /* [6] Unicode version; main() presets it to "3.0.0" */
+};
+
+extern int
+main(int argc, const char *argv[]) {
+    char filename[300];
+    const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
+    char *basename=NULL;
+    UErrorCode errorCode=U_ZERO_ERROR;
+    /* the process exit status is a UErrorCode value (U_ZERO_ERROR if nothing failed) */
+    /* preset the defaults for destdir, sourcedir and Unicode version, then read command line options */
+    options[4].value=u_getDataDirectory();
+    options[5].value="";
+    options[6].value="3.0.0";
+    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
+
+    /* error handling, printing usage message; a negative argc is used as the negated index of the bad argument */
+    if(argc<0) {
+        fprintf(stderr,
+            "error in command line argument \"%s\"\n",
+            argv[-argc]);
+    }
+    if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
+        fprintf(stderr,
+            "usage: %s [-options] [suffix]\n"
+            "\tread the UnicodeData.txt file and other Unicode properties files and\n"
+            "\tcreate a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n"
+            "\toptions:\n"
+            "\t\t-h or -? or --help  this usage text\n"
+            "\t\t-v or --verbose     verbose output\n"
+            "\t\t-c or --copyright   include a copyright notice\n"
+            "\t\t-d or --destdir     destination directory, followed by the path\n"
+            "\t\t-s or --sourcedir   source directory, followed by the path\n"
+            "\t\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
+            "\t\tsuffix              suffix that is to be appended with a '-'\n"
+            "\t\t                    to the source file basenames before opening;\n"
+            "\t\t                    'genprops new' will read UnicodeData-new.txt etc.\n",
+            argv[0]);
+        return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
+    }
+
+    /* get the options values */
+    beVerbose=options[2].doesOccur;
+    haveCopyright=options[3].doesOccur;
+    srcDir=options[5].value;
+    destDir=options[4].value;
+
+    if(argc>=2) {
+        suffix=argv[1]; /* first remaining non-option argument */
+    } else {
+        suffix=NULL;
+    }
+
+    setUnicodeVersion(options[6].value);
+
+    /* prepare the filename beginning with the source dir; basename points to where each file name is written */
+    uprv_strcpy(filename, srcDir); /* NOTE: not bounds-checked against sizeof(filename) */
+    basename=filename+uprv_strlen(filename);
+    if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
+        *basename++=U_FILE_SEP_CHAR; /* step past the separator so the file name does not overwrite it */
+    }
+
+    /* initialize */
+    init();
+    initStore();
+
+    /* process Mirror.txt, or Mirror-<suffix>.txt if a suffix was given */
+    if(suffix==NULL) {
+        uprv_strcpy(basename, "Mirror.txt");
+    } else {
+        uprv_strcpy(basename, "Mirror");
+        basename[6]='-';
+        uprv_strcpy(basename+7, suffix);
+        uprv_strcat(basename+7, ".txt");
+    }
+    parseMirror(filename, &errorCode);
+
+    /* process UnicodeData.txt, or UnicodeData-<suffix>.txt; parseDB() does nothing if errorCode already failed */
+    if(suffix==NULL) {
+        uprv_strcpy(basename, "UnicodeData.txt");
+    } else {
+        uprv_strcpy(basename, "UnicodeData");
+        basename[11]='-';
+        uprv_strcpy(basename+12, suffix);
+        uprv_strcat(basename+12, ".txt");
+    }
+    parseDB(filename, &errorCode);
+
+    /* process parsed data */
+    if(U_SUCCESS(errorCode)) {
+        repeatProps();
+        compactProps();
+        compactStage3();
+        compactStage2();
+
+        /* write the properties data file */
+        generateData(destDir);
+    }
+
+    return errorCode;
+}
+
+static void
+init(void) { /* currently empty: performs no work; called once from main() before initStore() */
+}
+
+/* parser for Mirror.txt ---------------------------------------------------- */
+
+#define MAX_MIRROR_COUNT 2000 /* number of rows in mirrorMappings[] */
+
+static uint32_t mirrorMappings[MAX_MIRROR_COUNT][2]; /* one row per Mirror.txt line: [0] and [1] are its two hex fields */
+static int32_t mirrorCount=0; /* rows completed so far; incremented by MirrorFinish() */
+
+static void
+MirrorCode(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
+    char *end;
+    /* parse the hexadecimal value in [start, limit) into column fieldNr of the current row; context and pErrorCode are unused */
+    mirrorMappings[mirrorCount][fieldNr]=uprv_strtoul(start, &end, 16); /* assumes fieldNr is 0 or 1 -- TODO confirm against u_parseDelimitedFile() */
+    if((end-start)<1 || end!=limit) { /* nothing parsed, or the number does not fill the whole field */
+        fprintf(stderr, "genprops: syntax error in Mirror.txt field %d at %s\n", fieldNr, start);
+        exit(U_PARSE_ERROR);
+    }
+}
+
+static void
+MirrorFinish(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
+    if(++mirrorCount==MAX_MIRROR_COUNT) { /* count the finished row; exit once the count reaches the table size */
+        fprintf(stderr, "genprops: too many mirror mappings\n");
+        exit(U_INDEX_OUTOFBOUNDS_ERROR);
+    }
+}
+
+static UParseFieldFn *mirrorFields[4]={ /* passed to u_parseDelimitedFile() with a field count of 2 */
+    NULL,           /* presumably the per-line "init" slot (unused here) -- TODO confirm in uparse.h */
+    MirrorCode,     /* presumably field 0 -> mirrorMappings[n][0] */
+    MirrorCode,     /* presumably field 1 -> mirrorMappings[n][1] */
+    MirrorFinish    /* presumably the per-line "finish" slot: counts the row */
+};
+
+static void
+parseMirror(const char *filename, UErrorCode *pErrorCode) {
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { /* do nothing without an error code or after an earlier failure */
+        return;
+    }
+    /* read the ';'-delimited file with the mirrorFields handlers, 2 fields per line, no context object */
+    u_parseDelimitedFile(filename, ';', mirrorFields, 2, NULL, pErrorCode);
+}
+
+/* parser for UnicodeData.txt ----------------------------------------------- */
+
+#define NO_NUMERIC_VALUE ((uint32_t)15821005)
+
+/* general categories */
 extern const char *const
 genCategoryNames[U_CHAR_CATEGORY_COUNT]={
         NULL,
@@ -55,327 +227,229 @@ bidiNames[U_CHAR_DIRECTION_COUNT]={
     "WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN"
 };
 
-/* prototypes --------------------------------------------------------------- */
+/* general categories that UnicodeDataFinish() substitutes for selected U_CONTROL_CHAR code points */
+static const struct {
+    uint32_t code;
+    uint8_t generalCategory;
+} controlProps[]={
+    { 0x09, U_SPACE_SEPARATOR },        /* TAB */
+    { 0x0b, U_SPACE_SEPARATOR },        /* VT */
+    { 0x0a, U_PARAGRAPH_SEPARATOR },    /* LF */
+    { 0x0c, U_LINE_SEPARATOR },         /* FF */
+    { 0x0d, U_PARAGRAPH_SEPARATOR },    /* CR */
+    { 0x1c, U_PARAGRAPH_SEPARATOR },    /* FS */
+    { 0x1d, U_PARAGRAPH_SEPARATOR },    /* GS */
+    { 0x1e, U_PARAGRAPH_SEPARATOR },    /* RS */
+    { 0x1f, U_SPACE_SEPARATOR },        /* US */
+    { 0x85, U_PARAGRAPH_SEPARATOR }     /* NL */
+};
 
 static void
-init(void);
+UnicodeDataInit(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
+    Props *p=(Props *)context;
 
-static void
-parseDB(FileStream *in);
-
-static int16_t
-getField(char *line, int16_t start, int16_t limit);
-
-static void
-checkLineIndex(uint32_t code, int16_t limit, int16_t length);
-
-/* -------------------------------------------------------------------------- */
-
-extern int
-main(int argc, char *argv[]) {
-    FileStream *in;
-    const char *destdir = 0;
-    char *arg, *filename=NULL;
-    int i;
-
-    if(argc<=1) {
-        fprintf(stderr,
-            "usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] filename\n"
-            "\tread the UnicodeData.txt file and \n"
-            "\tcreate a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n"
-            "\toptions:\n"
-            "\t\t-v[+|-]  verbose output\n"
-            "\t\t-c[+|-]  do (not) include a copyright notice\n"
-            "\t\tfilename  absolute path/filename for the\n"
-            "\t\t\tUnicode database text file (default: standard input)\n",
-            argv[0]);
-    }
-
-    for(i=1; i<argc; ++i) {
-        arg=argv[i];
-        if(arg[0]=='-') {
-            switch(arg[1]) {
-            case 'v':
-                beVerbose= arg[2]=='+';
-                break;
-            case 'c':
-                haveCopyright= arg[2]=='+';
-                break;
-            default:
-                break;
-            }
-        } else {
-            filename=arg;
-        }
-    }
-
-    if(filename==NULL) {
-        in=T_FileStream_stdin();
-    } else {
-        in=T_FileStream_open(filename, "r");
-        if(in==NULL) {
-            fprintf(stderr, "genprops: unable to open input file %s\n", filename);
-            exit(U_FILE_ACCESS_ERROR);
-        }
-    }
-
-    if (!destdir) {
-        destdir = u_getDataDirectory();
-    }
-
-    init();
-    initStore();
-    parseDB(in);
-    repeatProps();
-    compactProps();
-    compactStage3();
-    compactStage2();
-    generateData(destdir);
-
-    if(in!=T_FileStream_stdin()) {
-        T_FileStream_close(in);
-    }
-
-    return 0;
+    /* reset the properties */
+    uprv_memset(p, 0, sizeof(Props));
+    p->numericValue=NO_NUMERIC_VALUE;
 }
 
 static void
-init(void) {
-}
-
-/* parsing ------------------------------------------------------------------ */
-
-static void
-parseDB(FileStream *in) {
-    char line[300];
+UnicodeDataCode(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
+    Props *p=(Props *)context;
     char *end;
-    Props p;
-    uint32_t value;
-    int16_t start, limit, length, i;
-    bool_t hasNumericValue;
 
-    while(T_FileStream_readLine(in, line, sizeof(line))!=NULL) {
-        length=uprv_strlen(line);
-
-        /* remove trailing newline characters */
-        while(length>0 && (line[length-1]=='\r' || line[length-1]=='\n')) {
-            line[--length]=0;
-        }
-
-        /* reset the properties */
-        uprv_memset(&p, 0, sizeof(p));
-        hasNumericValue=FALSE;
-
-        /* get the character code, field 0 */
-        p.code=uprv_strtoul(line, &end, 16);
-        limit=end-line;
-        if(limit<1 || *end!=';') {
-            fprintf(stderr, "genprops: syntax error in field 0 at code 0x%lx\n", p.code);
-            exit(U_PARSE_ERROR);
-        }
-
-        /* skip character name, field 1 */
-        checkLineIndex(p.code, ++limit, length);
-        limit=getField(line, limit, length);
-
-        /* get general category, field 2 */
-        start=limit+1;
-        checkLineIndex(p.code, start, length);
-        limit=getField(line, start, length);
-        line[limit]=0;
-        for(i=1;;) {
-            if(uprv_strcmp(line+start, genCategoryNames[i])==0) {
-                p.generalCategory=(uint8_t)i;
-                break;
-            }
-            if(++i==U_CHAR_CATEGORY_COUNT) {
-                fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n", line+start, p.code);
-                exit(U_PARSE_ERROR);
-            }
-        }
-
-        /* get canonical combining class, field 3 */
-        start=limit+1;
-        checkLineIndex(p.code, start, length);
-        p.canonicalCombining=(uint8_t)uprv_strtoul(line+start, &end, 10);
-        limit=end-line;
-        if(start>=limit || *end!=';') {
-            fprintf(stderr, "genprops: syntax error in field 3 at code 0x%lx\n", p.code);
-            exit(U_PARSE_ERROR);
-        }
-
-        /* get BiDi category, field 4 */
-        start=limit+1;
-        checkLineIndex(p.code, start, length);
-        limit=getField(line, start, length);
-        line[limit]=0;
-        for(i=0;;) {
-            if(uprv_strcmp(line+start, bidiNames[i])==0) {
-                p.bidi=(uint8_t)i;
-                break;
-            }
-            if(++i==U_CHAR_DIRECTION_COUNT) {
-                fprintf(stderr, "genprops: unknown BiDi category \"%s\" at code 0x%lx\n", line+start, p.code);
-                exit(U_PARSE_ERROR);
-            }
-        }
-
-        /* character decomposition mapping, field 5 */
-        /* ### skip for now */
-        checkLineIndex(p.code, ++limit, length);
-        limit=getField(line, limit, length);
-
-        /* decimal digit value, field 6 */
-        start=limit+1;
-        checkLineIndex(p.code, start, length);
-        value=uprv_strtoul(line+start, &end, 10);
-        if(*end!=';') {
-            fprintf(stderr, "genprops: syntax error in field 6 at code 0x%lx\n", p.code);
-            exit(U_PARSE_ERROR);
-        }
-        limit=end-line;
-        if(start<limit) {
-            p.numericValue=value;
-            hasNumericValue=TRUE;
-        }
-
-        /* digit value, field 7 */
-        start=limit+1;
-        checkLineIndex(p.code, start, length);
-        value=uprv_strtoul(line+start, &end, 10);
-        if(*end!=';') {
-            fprintf(stderr, "genprops: syntax error in field 7 at code 0x%lx\n", p.code);
-            exit(U_PARSE_ERROR);
-        }
-        limit=end-line;
-        if(start<limit) {
-            if(hasNumericValue) {
-                if(p.numericValue!=value) {
-                    fprintf(stderr, "genprops: more than one numeric value at code 0x%lx\n", p.code);
-                    exit(U_PARSE_ERROR);
-                }
-            } else {
-                p.numericValue=value;
-                hasNumericValue=TRUE;
-            }
-        }
-
-        /* numeric value, field 8 */
-        start=limit+1;
-        checkLineIndex(p.code, start, length);
-        value=uprv_strtoul(line+start, &end, 10);
-        if(value>0 && *end=='/') {
-            p.denominator=uprv_strtoul(end+1, &end, 10);
-        }
-        if(*end!=';') {
-            fprintf(stderr, "genprops: syntax error in field 8 at code 0x%lx\n", p.code);
-            exit(U_PARSE_ERROR);
-        }
-        limit=end-line;
-        if(start<limit) {
-            if(hasNumericValue) {
-                if(p.numericValue!=value) {
-                    fprintf(stderr, "genprops: more than one numeric value at code 0x%lx\n", p.code);
-                    exit(U_PARSE_ERROR);
-                }
-            } else {
-                p.numericValue=value;
-                hasNumericValue=TRUE;
-            }
-        }
-
-        /* get Mirrored flag, field 9 */
-        start=limit+1;
-        checkLineIndex(p.code, start, length);
-        limit=getField(line, start, length);
-        if(line[start]=='Y') {
-            p.isMirrored=1;
-        } else if(limit-start!=1 || line[start]!='N') {
-            fprintf(stderr, "genprops: syntax error in field 9 at code 0x%lx\n", p.code);
-            exit(U_PARSE_ERROR);
-        }
-
-        /* skip Unicode 1.0 character name, field 10 */
-        checkLineIndex(p.code, ++limit, length);
-        limit=getField(line, limit, length);
-
-        /* skip comment, field 11 */
-        checkLineIndex(p.code, ++limit, length);
-        limit=getField(line, limit, length);
-
-        /* get uppercase mapping, field 12 */
-        start=limit+1;
-        checkLineIndex(p.code, start, length);
-        p.upperCase=uprv_strtoul(line+start, &end, 16);
-        limit=end-line;
-        if(*end!=';') {
-            fprintf(stderr, "genprops: syntax error in field 12 at code 0x%lx\n", p.code);
-            exit(U_PARSE_ERROR);
-        }
-
-        /* get lowercase mapping, field 13 */
-        start=limit+1;
-        checkLineIndex(p.code, start, length);
-        p.lowerCase=uprv_strtoul(line+start, &end, 16);
-        limit=end-line;
-        if(*end!=';') {
-            fprintf(stderr, "genprops: syntax error in field 13 at code 0x%lx\n", p.code);
-            exit(U_PARSE_ERROR);
-        }
-
-        /* get titlecase mapping, field 14 */
-        start=limit+1;
-        if(start<length) {
-            /* this is the last field */
-            p.titleCase=uprv_strtoul(line+start, &end, 16);
-            if(*end!=';' && *end!=0) {
-                fprintf(stderr, "genprops: syntax error in field 14 at code 0x%lx\n", p.code);
-                exit(U_PARSE_ERROR);
-            }
-        }
-
-#if 0
-        /* debug output */
-        if(beVerbose) {
-            printf(
-                "0x%06lx "
-                "%s(%2d) "
-                "comb=%3d "
-                "bidi=%3s(%2d) "
-                "num=%7d/%7d "
-                "mirr=%d "
-                "u%06lx l%06lx t%06lx"
-                "\n",
-                p.code,
-                genCategoryNames[p.generalCategory], p.generalCategory,
-                p.canonicalCombining,
-                bidiNames[p.bidi], p.bidi,
-                p.numericValue, p.denominator,
-                p.isMirrored,
-                p.upperCase, p.lowerCase, p.titleCase);
-        }
-#endif
-
-        addProps(&p);
-    }
-}
-
-static int16_t
-getField(char *line, int16_t start, int16_t limit) {
-    while(start<limit && line[start]!=';') {
-        ++start;
-    }
-    return start;
-}
-
-static void
-checkLineIndex(uint32_t code, int16_t index, int16_t length) {
-    if(index>=length) {
-        fprintf(stderr, "genprops: too few fields at code 0x%lx\n", code);
+    /* get the character code, field 0 */
+    p->code=uprv_strtoul(start, &end, 16);
+    if((end-start)<1 || end!=limit) {
+        fprintf(stderr, "genprops: syntax error in field 0 at %s\n", start);
         exit(U_PARSE_ERROR);
     }
 }
 
+static void
+UnicodeDataCategory(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
+    Props *p=(Props *)context;
+    int i;
+    char c;
+    /* looks up the field text in genCategoryNames[] and stores the matching index; exits on an unknown name */
+    /* get general category, field 2 */
+    c=*limit;   /* remember the character after the field ... */
+    *limit=0;   /* ... and NUL-terminate the field for uprv_strcmp() */
+    for(i=1;;) { /* starts at 1: genCategoryNames[0] is NULL */
+        if(uprv_strcmp(start, genCategoryNames[i])==0) {
+            p->generalCategory=(uint8_t)i;
+            break;
+        }
+        if(++i==U_CHAR_CATEGORY_COUNT) {
+            fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n", start, p->code);
+            exit(U_PARSE_ERROR);
+        }
+    }
+    *limit=c;   /* restore the overwritten character */
+}
+
+static void
+UnicodeDataCombining(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
+    Props *p=(Props *)context;
+    char *end;
+    /* parses a decimal number that must fill the whole field; the value is truncated to uint8_t */
+    /* get canonical combining class, field 3 */
+    p->canonicalCombining=(uint8_t)uprv_strtoul(start, &end, 10);
+    if(start>=end || end!=limit) { /* empty field, or characters left over after the number */
+        fprintf(stderr, "genprops: syntax error in field 3 at code 0x%lx\n", p->code);
+        exit(U_PARSE_ERROR);
+    }
+}
+
+static void
+UnicodeDataBiDi(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
+    Props *p=(Props *)context;
+    int i;
+    char c;
+    /* looks up the field text in bidiNames[] and stores the matching index; exits on an unknown name */
+    /* get BiDi category, field 4 */
+    c=*limit;   /* remember the character after the field ... */
+    *limit=0;   /* ... and NUL-terminate the field for uprv_strcmp() */
+    for(i=0;;) { /* unlike the general categories, the search starts at index 0 */
+        if(uprv_strcmp(start, bidiNames[i])==0) {
+            p->bidi=(uint8_t)i;
+            break;
+        }
+        if(++i==U_CHAR_DIRECTION_COUNT) {
+            fprintf(stderr, "genprops: unknown BiDi category \"%s\" at code 0x%lx\n", start, p->code);
+            exit(U_PARSE_ERROR);
+        }
+    }
+    *limit=c;   /* restore the overwritten character */
+}
+
+static void
+UnicodeDataNumeric(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
+    Props *p=(Props *)context;
+    uint32_t value;
+    char *end;
+    /* shared handler for the three numeric fields; an empty field leaves p->numericValue untouched */
+    /* decimal digit value, field 6 */
+    /* digit value, field 7 */
+    /* numeric value, field 8 */
+    value=uprv_strtoul(start, &end, 10);
+    if(fieldNr==8 && value>0 && *end=='/') {
+        /* field 8 may contain a fractional value, get the denominator */
+        p->denominator=uprv_strtoul(end+1, &end, 10);
+    }
+    if(end!=limit) { /* characters left over after the number: report the field actually being parsed */
+        fprintf(stderr, "genprops: syntax error in field %d at code 0x%lx\n", fieldNr, p->code);
+        exit(U_PARSE_ERROR);
+    }
+    if(start<end) { /* something was parsed, i.e. the field is not empty */
+        if(p->numericValue!=NO_NUMERIC_VALUE && p->numericValue!=value) { /* an earlier field set a different value */
+            fprintf(stderr, "genprops: more than one numeric value at code 0x%lx\n", p->code);
+            exit(U_PARSE_ERROR);
+        }
+        p->numericValue=value;
+    }
+}
+
+static void
+UnicodeDataMirrored(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
+    Props *p=(Props *)context;
+    /* sets p->isMirrored for 'Y'; anything else must be exactly "N" or the program exits */
+    /* get Mirrored flag, field 9 */
+    if(*start=='Y') { /* only the first character is tested; the field length is not checked here */
+        p->isMirrored=1;
+    } else if(limit-start!=1 || *start!='N') {
+        fprintf(stderr, "genprops: syntax error in field 9 at code 0x%lx\n", p->code);
+        exit(U_PARSE_ERROR);
+    }
+}
+
+static void
+UnicodeDataCase(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
+    Props *p=(Props *)context;
+    char *end;
+    uint32_t mapping;
+    /* shared handler for the three case mapping fields; fieldNr selects which Props member is set */
+    /* get uppercase mapping, field 12 */
+    /* get lowercase mapping, field 13 */
+    /* get titlecase mapping, field 14 */
+    mapping=uprv_strtoul(start, &end, 16); /* an empty field parses as 0 and passes the check below when start==limit */
+    if(end!=limit) { /* characters left over after the hex number */
+        fprintf(stderr, "genprops: syntax error in field %d at code 0x%lx\n", fieldNr, p->code);
+        exit(U_PARSE_ERROR);
+    }
+    switch(fieldNr) { /* any other fieldNr is silently ignored */
+    case 12:
+        p->upperCase=mapping;
+        break;
+    case 13:
+        p->lowerCase=mapping;
+        break;
+    case 14:
+        p->titleCase=mapping;
+        break;
+    }
+}
+
+static void
+UnicodeDataFinish(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
+    static int32_t mirrorIndex=0; /* cursor into mirrorMappings[]; kept across calls, advances only on a match */
+    Props *p=(Props *)context;
+    int16_t i;
+    /* post-processes the collected Props and hands them to addProps(); start, limit, fieldNr, pErrorCode are unused */
+    if(p->numericValue==NO_NUMERIC_VALUE) { /* no numeric field was set: store 0 instead of the marker */
+        p->numericValue=0;
+    }
+
+    /* override properties for some common control characters */
+    if(p->generalCategory==U_CONTROL_CHAR) {
+        for(i=0; i<sizeof(controlProps)/sizeof(controlProps[0]); ++i) {
+            if(controlProps[i].code==p->code) {
+                p->generalCategory=controlProps[i].generalCategory;
+            }
+        }
+    }
+
+    /* set additional properties from previously parsed files; only the row at mirrorIndex is compared, so both inputs must list codes in the same order */
+    if(mirrorIndex<mirrorCount && p->code==mirrorMappings[mirrorIndex][0]) {
+        p->mirrorMapping=mirrorMappings[mirrorIndex++][1];
+    }
+
+    addProps(p);
+}
+
+static UParseFieldFn *unicodeDBFields[17]={ /* passed to u_parseDelimitedFile() with a field count of 15 */
+    UnicodeDataInit,        /* presumably run before the fields of each line -- TODO confirm in uparse.h */
+
+    UnicodeDataCode,        /* 0: code point */
+    NULL,                   /* 1: character name */
+    UnicodeDataCategory,    /* 2: general category */
+    UnicodeDataCombining,   /* 3: canonical combining class */
+    UnicodeDataBiDi,        /* 4: BiDi category */
+    NULL,                   /* 5: character decomposition mapping */
+    UnicodeDataNumeric,     /* 6: decimal digit value */
+    UnicodeDataNumeric,     /* 7: digit value */
+    UnicodeDataNumeric,     /* 8: numeric value */
+    UnicodeDataMirrored,    /* 9: Mirrored flag */
+    NULL,                   /* 10: Unicode 1.0 character name */
+    NULL,                   /* 11: comment */
+    UnicodeDataCase,        /* 12: uppercase mapping */
+    UnicodeDataCase,        /* 13: lowercase mapping */
+    UnicodeDataCase,        /* 14: titlecase mapping */
+
+    UnicodeDataFinish       /* presumably run after the fields of each line; calls addProps() */
+};
+
+static void
+parseDB(const char *filename, UErrorCode *pErrorCode) {
+    Props p; /* scratch record handed to every field handler as its context */
+
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { /* do nothing without an error code or after an earlier failure */
+        return;
+    }
+    /* read the ';'-delimited file with the unicodeDBFields handlers, 15 fields per line */
+    u_parseDelimitedFile(filename, ';', unicodeDBFields, 15, &p, pErrorCode);
+}
+
 /*
  * Hey, Emacs, please set the following:
  *
diff --git a/icu4c/source/tools/genprops/genprops.h b/icu4c/source/tools/genprops/genprops.h
index b6c3ad2f80..416b67094d 100644
--- a/icu4c/source/tools/genprops/genprops.h
+++ b/icu4c/source/tools/genprops/genprops.h
@@ -25,8 +25,8 @@
 
 /* character properties */
 typedef struct {
-    uint32_t code, lowerCase, upperCase, titleCase;
-    uint32_t decomp[16];
+    uint32_t code, lowerCase, upperCase, titleCase, mirrorMapping;
+    /* ### uint32_t decomp[16]; */
     uint32_t numericValue, denominator;
     uint8_t generalCategory, canonicalCombining, bidi, isMirrored;
 } Props;
@@ -42,6 +42,9 @@ extern const char *const
 genCategoryNames[];
 
 /* prototypes */
+extern void
+setUnicodeVersion(const char *v); /* v: version string such as "3.0.0"; defined in store.c */
+
 extern void
 initStore(void);
 
diff --git a/icu4c/source/tools/genprops/store.c b/icu4c/source/tools/genprops/store.c
index b30d8e0202..57a19be0d1 100644
--- a/icu4c/source/tools/genprops/store.c
+++ b/icu4c/source/tools/genprops/store.c
@@ -47,7 +47,7 @@ The following is a description of format version 1.0 .
 Data contents:
 
 The contents is a parsed, binary form of several Unicode character
-database files, mose prominently UnicodeData.txt.
+database files, most prominently UnicodeData.txt.
 
 Any Unicode code point from 0 to 0x10ffff can be looked up to get
 the properties, if any, for that code point. This means that the input
@@ -72,7 +72,7 @@ Formally, the file contains the following structures:
     A1 const uint16_t STAGE_3_BITS(=4);
       (STAGE_1_BITS(=11) not stored, implicitly=21-(STAGE_2_BITS+STAGE_3_BITS))
     A2 const uint16_t exceptionsIndex;  -- 32-bit unit index
-    A3 const uint16_t ucharsIndex;      -- 32-bit unit index
+    A3 const uint16_t reservedIndex;
     A4 const uint16_t reservedIndex;
     A5 const uint16_t reservedIndex;
     A6 const uint16_t reservedIndex;
@@ -84,10 +84,7 @@ Formally, the file contains the following structures:
        (possible 1*uint16_t for padding to 4-alignment)
 
     P  const uint32_t props32[variable size];
-    E  const uint16_t exceptions[variable size];
-       (possible 1*uint16_t for padding to 4-alignment)
-
-    U  const UChar uchars[variable size];
+    E  const uint32_t exceptions[variable size];
 
 3-stage lookup and properties:
 
@@ -124,8 +121,7 @@ arrive at an index into the props32[] table containing the character
 properties for c.
 For some characters, not all of the properties can be efficiently encoded
 using 32 bits. For them, the 32-bit word contains an index into the exceptions[]
-array. Some exception entries, in turn, may contain indexes into the uchars[]
-array of Unicode strings, especially for non-1:1 case mappings.
+array.
 
 The first stage consumes the 11 most significant bits of the 21-bit code point
 and results in an index into the second stage:
@@ -142,28 +138,27 @@ specific value, which itself is only an index into the props32[] table:
 
     uint16_t i=p16[i3+(c&0xf)];
 
+Note that the bit numbers and shifts actually depend on the STAGE_2/3_BITS
+in p16[0..1].
+
 There is finally the 32-bit encoded set of properties for c:
 
     uint32_t props=p32[i];
 
 For some characters, this contains an index into the exceptions array:
 
-    if(props&0x20) {
-        uint16_t e=(uint16_t)(props>>20);
+    if(props&EXCEPTION_BIT) {
+        uint16_t e=(uint16_t)(props>>VALUE_SHIFT);
         ...
     }
 
-The exception values are a variable number of uint16_t starting at
+The exception values are a variable number of uint32_t starting at
 
-    const uint16_t *pe=p16+2*exceptionsIndex+e;
+    const uint32_t *pe=p32+exceptionsIndex+e;
 
-The first uint16_t there contains flags about what values actually follow it.
-Some of those may be indexes for case mappings or similar and point to strings
-(zero-terminated) in the uchars[] array:
-
-    ...
-    uint16_t u=pe[index depends on pe[0]];
-    const UChar *pu=(const UChar *)(p32+ucharsIndex)+u;
+The first uint32_t there contains flags about what values actually follow it.
+Some of the exception values are UChar32 code points for the case mappings,
+others are numeric values etc.
 
 32-bit properties sets:
 
@@ -171,9 +166,9 @@ Each 32-bit properties word contains:
 
  0.. 4  general category
  5      has exception values
- 6.. 9  BiDi category (the 5 explicit codes stored as one)
-10      is mirrored
-11..19  reserved
+ 6..10  BiDi category
+11      is mirrored
+12..19  reserved
 20..31  value according to bits 0..5:
         if(has exception) {
             exception index;
@@ -181,52 +176,82 @@ Each 32-bit properties word contains:
         case Ll: delta to uppercase; -- same as titlecase
         case Lu: delta to lowercase; -- titlecase is same as c
         case Lt: delta to lowercase; -- uppercase is same as c
-        case Mn: canonical category;
+        case Mn: combining class;
         case N*: numeric value;
-        default: *;
+        default:
+            if(is mirrored) {
+                delta to mirror
+            } else {
+                0
+            };
         }
 
 Exception values:
 
-The first uint16_t word of exception values for a code point contains flags
-that indicate which values follow:
+In the first uint32_t exception word for a code point,
+bits
+31..24  reserved
+23..16  combining class
+15..0   flags that indicate which values follow:
 
+bit
  0      has uppercase mapping
  1      has lowercase mapping
  2      has titlecase mapping
- 3      has canonical category
- 4      has numeric value (numerator)
- 5      has denominator value
+ 3      has numeric value (numerator)
+ 4      has denominator value
+ 5      has a mirror-image Unicode code point
 
-According to the flags in this word, one or more uint16_t words follow it
+According to the flags in this word, one or more uint32_t words follow it
 in the sequence of the bit flags in the flags word; if a flag is not set,
 then the value is missing or 0:
 
-For the case mappings, one uint16_t word each is an index into uchars[],
-pointing to a zero-terminated UChar string for the case mapping.
+For the case mappings and the mirror-image Unicode code point,
+one uint32_t or UChar32 each is the code point.
 
-For the canonical category, the lower 8 bits of a uint16_t word give the
-category value directly. The upper 8 bits are currently reserved.
-
-For the numeric/numerator value, a uint16_t word contains the value directly,
+For the numeric/numerator value, an int32_t word contains the value directly,
 except for when there is no numerator but a denominator, then the numerator
 is 1.
 
-For the denominator value, a uint16_t word contains the value directly.
+For the denominator value, a uint32_t word contains the value directly.
 
 Example:
 U+2160, ROMAN NUMERAL ONE, needs an exception because it has a lowercase
 mapping and a numeric value.
-Its exception values would be stored as 3 uint16_t words:
+Its exception values would be stored as 3 uint32_t words:
 
-- flags=0x12 (see above)
-- lowercase index into uchars[]
+- flags=0x0a (see above) with combining class 0
+- lowercase mapping 0x2170
 - numeric value=1
 
 ----------------------------------------------------------------------------- */
 
+/* ### finding an exception value */
+#define HAVE_EXCEPTION_VALUE(flags, index) ((flags)&(1<<(index))) /* non-zero if flag bit number index is set */
+
+/* number of set bits (population count) in each integer value 0..31 */
+static uint8_t flagsOffset[32]={
+    0, 1, 1, 2, 1, 2, 2, 3,
+    1, 2, 2, 3, 2, 3, 3, 4,
+    1, 2, 2, 3, 2, 3, 3, 4,
+    2, 3, 3, 4, 3, 4, 4, 5
+};
+/* adds to offset the count of set flag bits below bit index; may modify flags and index, so pass lvalues; table lookups stay in range for index 0..10 */
+#define GET_EXCEPTION_OFFSET(flags, index, offset) { \
+    if((index)>=5) { \
+        (offset)+=flagsOffset[(flags)&0x1f]; \
+        (flags)>>=5; \
+        (index)-=5; \
+    } \
+    (offset)+=flagsOffset[(flags)&((1<<(index))-1)]; \
+}
+
+
+
+
+
 /* UDataInfo cf. udata.h */
-static const UDataInfo dataInfo={
+static UDataInfo dataInfo={
     sizeof(UDataInfo),
     0,
 
@@ -262,6 +287,19 @@ enum {
     MAX_STAGE_2_COUNT=MAX_PROPS_COUNT
 };
 
+/* definitions for the properties words: bit positions and the range of the value field */
+enum {
+    EXCEPTION_SHIFT=5,                  /* bit 5 */
+    BIDI_SHIFT,                         /* implicitly 6 */
+    MIRROR_SHIFT=BIDI_SHIFT+5,          /* 11 */
+    VALUE_SHIFT=20,                     /* the value field starts at bit 20 */
+
+    EXCEPTION_BIT=1UL<<EXCEPTION_SHIFT, /* 0x20 */
+    VALUE_BITS=32-VALUE_SHIFT,          /* 12 */
+    MAX_VALUE=(1UL<<(VALUE_BITS-1))-1,  /* 2047: largest signed value that fits in VALUE_BITS bits */
+    MIN_VALUE=-(MAX_VALUE+1)            /* -2048 */
+};
+
 static uint16_t stage1[STAGE_1_BLOCK], stage2[MAX_STAGE_2_COUNT],
                 stage3[MAX_PROPS_COUNT], map[MAX_PROPS_COUNT];
 
@@ -273,16 +311,14 @@ static uint32_t props[MAX_PROPS_COUNT], props32[MAX_PROPS_COUNT];
 static uint16_t propsTop=STAGE_3_BLOCK; /* the first props[] are always empty */
 
 /* exceptions values */
-static uint16_t exceptions[MAX_EXCEPTIONS_COUNT+20];
+static uint32_t exceptions[MAX_EXCEPTIONS_COUNT+20];
 static uint16_t exceptionsTop=0;
 
 /* Unicode characters, e.g. for special casing or decomposition */
-
 static UChar uchars[MAX_UCHAR_COUNT+20];
 static uint16_t ucharsTop=0;
 
 /* statistics */
-
 static uint16_t exceptionsCount=0;
 
 /* prototypes --------------------------------------------------------------- */
@@ -320,6 +356,38 @@ addUChars(const UChar *s, uint16_t length);
 
 /* -------------------------------------------------------------------------- */
 
+/* ### this must become public in putil.c */
+static void
+__versionFromString(UVersionInfo versionArray, const char *versionString) {
+    char *end;
+    uint16_t part=0;
+    /* parse up to U_MAX_VERSION_LENGTH decimal fields separated by U_VERSION_DELIMITER into versionArray[] */
+    if(versionArray==NULL) {
+        return;
+    }
+    /* ++part counts a parsed field before the delimiter test, so the zero-fill below keeps the last field */
+    if(versionString!=NULL) {
+        for(;;) {
+            versionArray[part]=(uint8_t)uprv_strtoul(versionString, &end, 10);
+            if(end==versionString || ++part==U_MAX_VERSION_LENGTH || *end!=U_VERSION_DELIMITER) {
+                break;
+            }
+            versionString=end+1;
+        }
+    }
+    /* fields that were not given (or all fields, for a NULL string) become 0 */
+    while(part<U_MAX_VERSION_LENGTH) {
+        versionArray[part++]=0;
+    }
+}
+
+extern void
+setUnicodeVersion(const char *v) { /* stores the version parsed from the string v in dataInfo.dataVersion */
+    UVersionInfo version;
+    __versionFromString(version, v); /* fills version[] from v */
+    uprv_memcpy(dataInfo.dataVersion, version, 4); /* copies 4 bytes of the parsed version */
+}
+
 extern void
 initStore() {
     uprv_memset(stage1, 0, sizeof(stage1));
@@ -334,12 +402,6 @@ initStore() {
 
 extern void
 addProps(Props *p) {
-    /* map the explicit BiDi codes to one single value */
-    static const uint8_t bidiMap[U_CHAR_DIRECTION_COUNT]={
-	    0, 1, 2, 3, 4, 5, 6, 7, 8,
-        9, 10, 15, 15, 11, 15, 15, 15, 12, 13
-    };
-
     uint32_t x;
     int32_t value;
     uint16_t count;
@@ -394,7 +456,7 @@ addProps(Props *p) {
         if(!(isMn || isNumber)) {
             value=(int32_t)p->code-(int32_t)p->upperCase;
         } else {
-            x=1<<5;
+            x=EXCEPTION_BIT;
         }
         ++count;
     }
@@ -403,7 +465,7 @@ addProps(Props *p) {
         if(!(isMn || isNumber)) {
             value=(int32_t)p->lowerCase-(int32_t)p->code;
         } else {
-            x=1<<5;
+            x=EXCEPTION_BIT;
         }
         ++count;
     }
@@ -412,7 +474,7 @@ addProps(Props *p) {
         if(!(isMn || isNumber)) {
             value=(int32_t)p->code-(int32_t)p->titleCase;
         } else {
-            x=1<<5;
+            x=EXCEPTION_BIT;
         }
         ++count;
     }
@@ -421,7 +483,7 @@ addProps(Props *p) {
         if(isMn) {
             value=p->canonicalCombining;
         } else {
-            x=1<<5;
+            x=EXCEPTION_BIT;
         }
         ++count;
     }
@@ -430,7 +492,7 @@ addProps(Props *p) {
         if(isNumber) {
             value=p->numericValue;
         } else {
-            x=1<<5;
+            x=EXCEPTION_BIT;
         }
         ++count;
     }
@@ -439,9 +501,15 @@ addProps(Props *p) {
         value=p->denominator;
         ++count;
     }
+    if(p->isMirrored) {
+        if(p->mirrorMapping!=0) {
+            value=(int32_t)p->mirrorMapping-(int32_t)p->code;
+        }
+        ++count;
+    }
 
     /* handle exceptions */
-    if(count>1 || x!=0 || value<-2048 || 2047<value) {
+    if(count>1 || x!=0 || value<MIN_VALUE || MAX_VALUE<value) {
         /* this code point needs exception values */
         if(DO_DEBUG_OUT /* ### beVerbose */) {
             if(x!=0) {
@@ -454,18 +522,58 @@ addProps(Props *p) {
         }
 
         ++exceptionsCount;
-        x=1<<5;
+        x=EXCEPTION_BIT;
 
-        /* ### allocate and create exception values */
-        value=-exceptionsCount;
+        /* allocate and create exception values */
+        value=exceptionsTop;
+        if(value>=4096) {
+            fprintf(stderr, "genprops: out of exceptions memory\n");
+            exit(U_MEMORY_ALLOCATION_ERROR);
+        } else {
+            uint32_t first=(uint32_t)p->canonicalCombining<<16;
+            uint16_t length=1;
+
+            if(p->upperCase!=0) {
+                first|=1;
+                exceptions[value+length++]=p->upperCase;
+            }
+            if(p->lowerCase!=0) {
+                first|=2;
+                exceptions[value+length++]=p->lowerCase;
+            }
+            if(p->upperCase!=p->titleCase) {
+                first|=4;
+                exceptions[value+length++]=p->titleCase;
+            }
+            if(p->denominator==0) {
+                if(p->numericValue!=0) {
+                    first|=8;
+                    exceptions[value+length++]=p->numericValue;
+                }
+            } else {
+                if(p->numericValue!=1) {
+                    first|=8;
+                    exceptions[value+length++]=p->numericValue;
+                }
+                first|=0x10;
+                exceptions[value+length++]=p->denominator;
+            }
+            if(p->isMirrored) {
+                first|=0x20;
+                exceptions[value+length++]=p->mirrorMapping;
+            }
+
+            exceptions[value]=first;
+            exceptionsTop+=length;
+        }
     }
 
     /* put together the 32-bit word of encoded properties */
     x|=
-        p->generalCategory |
-        bidiMap[p->bidi]<<6UL |
-        p->isMirrored<<10UL |
-        (uint32_t)value<<20;
+        (uint32_t)p->generalCategory |
+        (uint32_t)p->bidi<<BIDI_SHIFT |
+        (uint32_t)p->isMirrored<<MIRROR_SHIFT |
+        (uint32_t)value<<VALUE_SHIFT;
 
     setProps(p->code, x, &count, &count, &count);
 
@@ -911,17 +1019,15 @@ generateData(const char *dataDir) {
     }
 
     indexes[2]=offset+=propsTop;            /* uint32_t offset to exceptions[] */
-    indexes[3]=offset+=(exceptionsTop+1)/2; /* uint32_t offset to uchars[], include padding */
 
-    size=4*offset+ucharsTop*U_SIZEOF_UCHAR; /* total size of data */
+    size=4*(offset+exceptionsTop);          /* total size of data */
 
     if(beVerbose) {
         printf("number of stage 2 entries:              %5u\n", stage2Top);
         printf("number of stage 3 entries:              %5u\n", stage3Top);
         printf("number of unique properties values:     %5u\n", propsTop);
         printf("number of code points with exceptions:  %5u\n", exceptionsCount);
-        printf("size in bytes of exceptions:            %5u\n", 2*exceptionsTop);
-        printf("size in bytes of Uchars:                %5u\n", ucharsTop*U_SIZEOF_UCHAR);
+        printf("size in bytes of exceptions:            %5u\n", 4*exceptionsTop);
         printf("data size:                             %6lu\n", size);
     }
 
@@ -939,9 +1045,7 @@ generateData(const char *dataDir) {
     udata_writeBlock(pData, stage3, 2*stage3Top);
     udata_writePadding(pData, (stage2Top+stage3Top)&1);
     udata_writeBlock(pData, props32, 4*propsTop);
-    udata_writeBlock(pData, exceptions, 2*exceptionsTop);
-    udata_writePadding(pData, exceptionsTop&1);
-    udata_writeBlock(pData, uchars, ucharsTop*U_SIZEOF_UCHAR);
+    udata_writeBlock(pData, exceptions, 4*exceptionsTop);
 
     /* finish up */
     dataLength=udata_finish(pData, &errorCode);