ICU-7647 Add/use LaoBreakEngine and laodict.txt; more useful messages in gendict

X-SVN-Rev: 34229
2013-09-06 23:43:13 +00:00 · 2013-09-06 23:43:13 +00:00 · bf4126616b
commit bf4126616b
parent 4d9fad13ef
11 changed files with 11609 additions and 67 deletions
--- a/icu4c/source/common/brkeng.cpp
+++ b/icu4c/source/common/brkeng.cpp
@ -1,6 +1,6 @@
 /*
 ************************************************************************************
- * Copyright (C) 2006-2012, International Business Machines Corporation
+ * Copyright (C) 2006-2013, International Business Machines Corporation
 * and others. All Rights Reserved.
 ************************************************************************************
 */
@ -229,6 +229,9 @@ ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
            case USCRIPT_THAI:
                engine = new ThaiBreakEngine(m, status);
                break;
+            case USCRIPT_LAO:
+                engine = new LaoBreakEngine(m, status);
+                break;
            case USCRIPT_KHMER:
                engine = new KhmerBreakEngine(m, status);
                break;
--- a/icu4c/source/common/dictbe.cpp
+++ b/icu4c/source/common/dictbe.cpp
@ -88,10 +88,10 @@ DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {

 /*
 ******************************************************************
+ * PossibleWord
 */

-
-// Helper class for improving readability of the Thai word break
+// Helper class for improving readability of the Thai/Lao/Khmer word break
 // algorithm. The implementation is completely inline.

 // List size, limited by the maximum number of words in the dictionary
@ -183,6 +183,11 @@ PossibleWord::markCurrent() {
    mark = current;
 }

+/*
+ ******************************************************************
+ * ThaiBreakEngine
+ */
+
 // How many words in a row are "good enough"?
 #define THAI_LOOKAHEAD 3

@ -415,6 +420,203 @@ foundBest:
    return wordsFound;
 }

+/*
+ ******************************************************************
+ * LaoBreakEngine
+ */
+
+// How many words in a row are "good enough"?
+// Also the size of the PossibleWord array in
+// LaoBreakEngine::divideUpDictionaryRange, which is indexed modulo this value.
+#define LAO_LOOKAHEAD 3
+
+// Will not combine a non-word with a preceding dictionary word whose length is
+// this value or more (the test is wordLength < LAO_ROOT_COMBINE_THRESHOLD)
+#define LAO_ROOT_COMBINE_THRESHOLD 3
+
+// Will not combine a non-word that shares at least this much prefix with a
+// dictionary word, with a preceding word
+#define LAO_PREFIX_COMBINE_THRESHOLD 3
+
+// Minimum word size
+#define LAO_MIN_WORD 2
+
+// Minimum number of characters for two words; shorter ranges are not divided
+#define LAO_MIN_WORD_SPAN (LAO_MIN_WORD * 2)
+
+// Constructs the engine: passes the UBRK_WORD and UBRK_LINE bits to the base
+// class, adopts the dictionary matcher (deleted in the destructor) and builds
+// the character sets used by divideUpDictionaryRange. Failures from
+// applyPattern() are reported through status.
+LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
+    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
+      fDictionary(adoptDictionary)
+{
+    // Characters matching both [:Laoo:] and [:LineBreak=SA:]; these are the
+    // characters registered with the base class via setCharacters().
+    fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
+    if (U_SUCCESS(status)) {
+        setCharacters(fLaoWordSet);
+    }
+    // The same characters restricted to [:M:], plus U+0020. A break is never
+    // placed immediately before a member of this set.
+    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
+    fMarkSet.add(0x0020);
+    // Word-end candidates: every handled character except the prefix vowels.
+    fEndWordSet = fLaoWordSet;
+    fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
+    // Word-start candidates: consonants and prefix vowels.
+    fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
+    fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
+    fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
+
+    // Compact for caching.
+    fMarkSet.compact();
+    fEndWordSet.compact();
+    fBeginWordSet.compact();
+}
+
+// Deletes the dictionary matcher that was adopted by the constructor.
+LaoBreakEngine::~LaoBreakEngine() {
+    delete fDictionary;
+}
+
+// Divides the text between rangeStart and rangeEnd into words and pushes the
+// end position of each word onto foundBreaks.
+//
+// Dictionary candidates are tried with up to LAO_LOOKAHEAD words of lookahead;
+// where no dictionary word follows, the text is scanned for a plausible
+// boundary using fEndWordSet/fBeginWordSet. A break is never left immediately
+// before a character in fMarkSet.
+//
+// Returns 0 without touching foundBreaks when the range is shorter than
+// LAO_MIN_WORD_SPAN. Otherwise returns the number of words found; a break at
+// or beyond rangeEnd is popped again and not counted.
+int32_t
+LaoBreakEngine::divideUpDictionaryRange( UText *text,
+                                                int32_t rangeStart,
+                                                int32_t rangeEnd,
+                                                UStack &foundBreaks ) const {
+    if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
+        return 0;       // Not enough characters for two words
+    }
+
+    uint32_t wordsFound = 0;            // words accepted so far; also selects the active slot in words[]
+    int32_t wordLength;                 // length of the word found in the current iteration (0 if none)
+    int32_t current;                    // native index at the start of the current iteration
+    UErrorCode status = U_ZERO_ERROR;   // local only: a failure ends the loop but is not returned to the caller
+    PossibleWord words[LAO_LOOKAHEAD];
+    UChar32 uc;
+    
+    utext_setNativeIndex(text, rangeStart);
+    
+    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
+        wordLength = 0;
+
+        // Look for candidate words at the current position
+        int candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
+        
+        // If we found exactly one, use that
+        if (candidates == 1) {
+            wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
+            wordsFound += 1;
+        }
+        // If there was more than one, see which one can take us forward the most words
+        else if (candidates > 1) {
+            // If we're already at the end of the range, we're done
+            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
+                goto foundBest;
+            }
+            do {
+                int wordsMatched = 1;
+                if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
+                    // NOTE(review): wordsMatched was set to 1 just above, so this test is always true here.
+                    if (wordsMatched < 2) {
+                        // Followed by another dictionary word; mark first word as a good candidate
+                        words[wordsFound%LAO_LOOKAHEAD].markCurrent();
+                        wordsMatched = 2;
+                    }
+                    
+                    // If we're already at the end of the range, we're done
+                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
+                        goto foundBest;
+                    }
+                    
+                    // See if any of the possible second words is followed by a third word
+                    do {
+                        // If we find a third word, stop right away
+                        if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
+                            words[wordsFound % LAO_LOOKAHEAD].markCurrent();
+                            goto foundBest;
+                        }
+                    }
+                    while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
+                }
+            }
+            while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
+foundBest:
+            // Accept whichever first-word candidate was marked during the lookahead above.
+            wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
+            wordsFound += 1;
+        }
+        
+        // We come here after having either found a word or not. We look ahead to the
+        // next word. If it's not a dictionary word, we will combine it with the word we
+        // just found (if there is one), but only if the preceding word does not exceed
+        // the threshold.
+        // The text iterator should now be positioned at the end of the word we found.
+        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < LAO_ROOT_COMBINE_THRESHOLD) {
+            // if it is a dictionary word, do nothing. If it isn't, then if there is
+            // no preceding word, or the non-word shares less than the minimum threshold
+            // of characters with a dictionary word, then scan to resynchronize
+            if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
+                  && (wordLength == 0
+                      || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
+                // Look for a plausible word boundary
+                //TODO: This section will need a rework for UText.
+                int32_t remaining = rangeEnd - (current+wordLength);
+                UChar32 pc = utext_current32(text);
+                int32_t chars = 0;      // number of characters passed over by the scan below
+                for (;;) {
+                    utext_next32(text);
+                    uc = utext_current32(text);
+                    // TODO: Here we're counting on the fact that the SA languages are all
+                    // in the BMP. This should get fixed with the UText rework.
+                    chars += 1;
+                    if (--remaining <= 0) {
+                        break;
+                    }
+                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
+                        // Maybe. See if it's in the dictionary.
+                        int candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
+                        // Put the iterator back just past the scanned characters
+                        // (candidates() presumably moves it -- confirm in PossibleWord).
+                        utext_setNativeIndex(text, current + wordLength + chars);
+                        if (candidates > 0) {
+                            break;
+                        }
+                    }
+                    pc = uc;
+                }
+                
+                // Bump the word count if there wasn't already one
+                if (wordLength <= 0) {
+                    wordsFound += 1;
+                }
+                
+                // Update the length with the passed-over characters
+                wordLength += chars;
+            }
+            else {
+                // Back up to where we were for next iteration
+                utext_setNativeIndex(text, current+wordLength);
+            }
+        }
+        
+        // Never stop before a combining mark.
+        int32_t currPos;
+        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
+            utext_next32(text);
+            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
+        }
+        
+        // Look ahead for possible suffixes if a dictionary word does not follow.
+        // We do this in code rather than using a rule so that the heuristic
+        // resynch continues to function. For example, one of the suffix characters
+        // could be a typo in the middle of a word.
+        // NOT CURRENTLY APPLICABLE TO LAO
+
+        // Did we find a word on this iteration? If so, push it on the break stack
+        if (wordLength > 0) {
+            foundBreaks.push((current+wordLength), status);
+        }
+    }
+
+    // Don't return a break for the end of the dictionary range if there is one there.
+    if (foundBreaks.peeki() >= rangeEnd) {
+        (void) foundBreaks.popi();
+        wordsFound -= 1;
+    }
+
+    return wordsFound;
+}
+
+/*
+ ******************************************************************
+ * KhmerBreakEngine
+ */
+
 // How many words in a row are "good enough"?
 #define KHMER_LOOKAHEAD 3

--- a/icu4c/source/common/dictbe.h
+++ b/icu4c/source/common/dictbe.h
@ -1,6 +1,6 @@
 /**
 *******************************************************************************
- * Copyright (C) 2006,2012, International Business Machines Corporation        *
+ * Copyright (C) 2006,2012-2013, International Business Machines Corporation   *
 * and others. All Rights Reserved.                                            *
 *******************************************************************************
 */
@ -186,6 +186,118 @@ class ThaiBreakEngine : public DictionaryBreakEngine {

 };

+/*******************************************************************
+ * LaoBreakEngine
+ */
+
+/**
+ * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
+ * dictionary and heuristics to determine Lao-specific breaks.</p>
+ *
+ * <p>After it is constructed a LaoBreakEngine may be shared between
+ * threads without synchronization.</p>
+ */
+class LaoBreakEngine : public DictionaryBreakEngine {
+ private:
+    /**
+     * The sets of characters used by this engine, and the dictionary it adopts
+     * @internal
+     */
+
+  UnicodeSet                fLaoWordSet;    // all characters handled by this engine
+  UnicodeSet                fEndWordSet;    // characters a word may end on (boundary heuristic)
+  UnicodeSet                fBeginWordSet;  // characters a word may begin with (boundary heuristic)
+  UnicodeSet                fMarkSet;       // a break is never placed immediately before these
+  DictionaryMatcher  *fDictionary;          // adopted; deleted by the destructor
+
+ public:
+
+  /**
+   * <p>Constructor.</p>
+   *
+   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
+   * engine is deleted.
+   * @param status Set to a failure code if the engine's character sets
+   * cannot be built from their patterns.
+   */
+  LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
+
+  /**
+   * <p>Virtual destructor.</p>
+   */
+  virtual ~LaoBreakEngine();
+
+ protected:
+ /**
+  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
+  *
+  * @param text A UText representing the text
+  * @param rangeStart The start of the range of dictionary characters
+  * @param rangeEnd The end of the range of dictionary characters
+  * @param foundBreaks A UStack onto which the int32_t break positions are pushed
+  * @return The number of breaks found
+  */
+  virtual int32_t divideUpDictionaryRange( UText *text,
+                                           int32_t rangeStart,
+                                           int32_t rangeEnd,
+                                           UStack &foundBreaks ) const;
+
+};
+
+/******************************************************************* 
+ * KhmerBreakEngine 
+ */ 
+ 
+/** 
+ * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a 
+ * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> 
+ * 
+ * <p>After it is constructed a KhmerBreakEngine may be shared between 
+ * threads without synchronization.</p> 
+ */ 
+class KhmerBreakEngine : public DictionaryBreakEngine { 
+ private: 
+    /** 
+     * The sets of characters used by this engine, and its dictionary 
+     * @internal 
+     */ 
+ 
+  UnicodeSet                fKhmerWordSet; 
+  UnicodeSet                fEndWordSet; 
+  UnicodeSet                fBeginWordSet; 
+  UnicodeSet                fMarkSet; 
+  DictionaryMatcher  *fDictionary; 
+ 
+ public: 
+ 
+  /** 
+   * <p>Constructor.</p> 
+   * 
+   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 
+   * engine is deleted. 
+   * @param status ICU error code, passed by reference. 
+   */ 
+  KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 
+ 
+  /** 
+   * <p>Virtual destructor.</p> 
+   */ 
+  virtual ~KhmerBreakEngine(); 
+ 
+ protected: 
+ /** 
+  * <p>Divide up a range of known dictionary characters.</p> 
+  * 
+  * @param text A UText representing the text 
+  * @param rangeStart The start of the range of dictionary characters 
+  * @param rangeEnd The end of the range of dictionary characters 
+  * @param foundBreaks Output UStack of int32_t break positions 
+  * @return The number of breaks found 
+  */ 
+  virtual int32_t divideUpDictionaryRange( UText *text, 
+                                           int32_t rangeStart, 
+                                           int32_t rangeEnd, 
+                                           UStack &foundBreaks ) const; 
+ 
+}; 
+ 
 #if !UCONFIG_NO_NORMALIZATION

 /*******************************************************************
@ -251,63 +363,6 @@ class CjkBreakEngine : public DictionaryBreakEngine {

 #endif

-/******************************************************************* 
- * KhmerBreakEngine 
- */ 
- 
-/** 
- * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a 
- * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> 
- * 
- * <p>After it is constructed a KhmerBreakEngine may be shared between 
- * threads without synchronization.</p> 
- */ 
-class KhmerBreakEngine : public DictionaryBreakEngine { 
- private: 
-    /** 
-     * The set of characters handled by this engine 
-     * @internal 
-     */ 
- 
-  UnicodeSet                fKhmerWordSet; 
-  UnicodeSet                fEndWordSet; 
-  UnicodeSet                fBeginWordSet; 
-  UnicodeSet                fMarkSet; 
-  DictionaryMatcher  *fDictionary; 
- 
- public: 
- 
-  /** 
-   * <p>Default constructor.</p> 
-   * 
-   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 
-   * engine is deleted. 
-   */ 
-  KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 
- 
-  /** 
-   * <p>Virtual destructor.</p> 
-   */ 
-  virtual ~KhmerBreakEngine(); 
- 
- protected: 
- /** 
-  * <p>Divide up a range of known dictionary characters.</p> 
-  * 
-  * @param text A UText representing the text 
-  * @param rangeStart The start of the range of dictionary characters 
-  * @param rangeEnd The end of the range of dictionary characters 
-  * @param foundBreaks Output of C array of int32_t break positions, or 0 
-  * @return The number of breaks found 
-  */ 
-  virtual int32_t divideUpDictionaryRange( UText *text, 
-                                           int32_t rangeStart, 
-                                           int32_t rangeEnd, 
-                                           UStack &foundBreaks ) const; 
- 
-}; 
- 
- 
 U_NAMESPACE_END

    /* DICTBE_H */
--- a/icu4c/source/data/Makefile.in
+++ b/icu4c/source/data/Makefile.in
@ -1,5 +1,5 @@
 ## Makefile.in for ICU data
-## Copyright (c) 1999-2012, International Business Machines Corporation and
+## Copyright (c) 1999-2013, International Business Machines Corporation and
 ## others. All Rights Reserved.

 ## Source directory information
@ -527,6 +527,9 @@ $(BRKBLDDIR)/%.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
 $(BRKBLDDIR)/thaidict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
 	$(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x0e00 -c -i $(BUILDDIR) $(BRKSRCDIR)/thaidict.txt $(BRKBLDDIR)/thaidict.dict

+$(BRKBLDDIR)/laodict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
+	$(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x0e80 -c -i $(BUILDDIR) $(BRKSRCDIR)/laodict.txt $(BRKBLDDIR)/laodict.dict
+
 # TODO: figure out why combining characters are here?
 $(BRKBLDDIR)/khmerdict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
 	$(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(BRKSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict
--- a/icu4c/source/data/brkitr/brkfiles.mk
+++ b/icu4c/source/data/brkitr/brkfiles.mk
@ -34,7 +34,7 @@ BRK_RES_ALIAS_SOURCE = $(BRK_RES_SYNTHETIC_ALIAS)


 # List of dictionary files (dict).
-BRK_DICT_SOURCE = cjdict.txt khmerdict.txt thaidict.txt
+BRK_DICT_SOURCE = cjdict.txt khmerdict.txt laodict.txt thaidict.txt


 # List of break iterator files (brk).
--- a/icu4c/source/data/brkitr/laodict.txt
+++ b/icu4c/source/data/brkitr/laodict.txt
--- a/icu4c/source/data/brkitr/root.txt
+++ b/icu4c/source/data/brkitr/root.txt
@ -20,6 +20,7 @@ root{
        Hira:process(dependency){"cjdict.dict"}
        Kata:process(dependency){"cjdict.dict"}
        Khmr:process(dependency){"khmerdict.dict"}
+        Laoo:process(dependency){"laodict.dict"}
        Thai:process(dependency){"thaidict.dict"}
    }
 }
--- a/icu4c/source/data/makedata.mak
+++ b/icu4c/source/data/makedata.mak
@ -743,6 +743,10 @@ $(ICUBRK)\thaidict.dict:
 	@echo Creating $(ICUBRK)\thaidict.dict
 	@"$(ICUTOOLS)\gendict\$(CFG)\gendict" -c --bytes --transform offset-0xe00 $(ICUSRCDATA_RELATIVE_PATH)\$(ICUBRK)\thaidict.txt "$(ICUBLD_PKG)\$(ICUBRK)\thaidict.dict"

+# Build the Lao dictionary. The --transform offset must match the laodict.dict
+# rule in data/Makefile.in (offset-0x0e80) so both build systems generate the
+# same data.
+$(ICUBRK)\laodict.dict:
+	@echo Creating $(ICUBRK)\laodict.dict
+	@"$(ICUTOOLS)\gendict\$(CFG)\gendict" -c --bytes --transform offset-0x0e80 $(ICUSRCDATA_RELATIVE_PATH)\$(ICUBRK)\laodict.txt "$(ICUBLD_PKG)\$(ICUBRK)\laodict.dict"
+
 $(ICUBRK)\khmerdict.dict:
 	@echo Creating $(ICUBRK)\khmerdict.dict
 	@"$(ICUTOOLS)\gendict\$(CFG)\gendict" -c --bytes --transform offset-0x1780 $(ICUSRCDATA_RELATIVE_PATH)\$(ICUBRK)\khmerdict.txt "$(ICUBLD_PKG)\$(ICUBRK)\khmerdict.dict"
--- a/icu4c/source/data/xml/brkitr/root.xml
+++ b/icu4c/source/data/xml/brkitr/root.xml
@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8" ?>
 <!--
- Copyright (c) 2010-2012 International Business Machines Corporation and others. All rights reserved.
+ Copyright (c) 2010-2013 International Business Machines Corporation and others. All rights reserved.
 -->
 <!DOCTYPE ldml SYSTEM "http://www.unicode.org/repos/cldr/trunk/common/dtd/ldml.dtd"
 [
@ -25,6 +25,7 @@
            </icu:boundaries>
            <icu:dictionaries>
                <icu:dictionary type="Thai" icu:dependency="thaidict.dict"/>
+                <icu:dictionary type="Laoo" icu:dependency="laodict.dict"/>
                <icu:dictionary type="Khmr" icu:dependency="khmerdict.dict"/>
                <icu:dictionary type="Hani" icu:dependency="cjdict.dict"/>
                <icu:dictionary type="Hira" icu:dependency="cjdict.dict"/>
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@ -691,6 +691,19 @@ Bangkok)•</data>
 <data>•ใช•มั้ย•</data>
 <data>•มั๊ยล่ะ•ที่รัก•</data>

+##########################################################################################
+#
+#   Lao Tests
+#
+##########################################################################################
+<locale en>
+# Basic check for #7647
+<line>
+<data>•ສະບາຍດີ•</data>
+<data>•ດີ•ຂອບໃຈ•</data>
+<data>•ເຈົ້າ•ເວົ້າ•ພາສາ•ອັງກິດ•ໄດ້•ບໍ່•</data>
+<data>•ກະລຸນາ•ເວົ້າ•ຊ້າ•ໆ•</data>
+
 ##########################################################################################
 #
 #   Khmer Tests
--- a/icu4c/source/tools/gendict/gendict.cpp
+++ b/icu4c/source/tools/gendict/gendict.cpp
@ -365,6 +365,9 @@ int  main(int argc, char **argv) {
    UBool hasValues = FALSE;
    UBool hasValuelessContents = FALSE;
    int lineCount = 0;
+    int wordCount = 0;
+    int minlen = 255;
+    int maxlen = 0;
    UBool isOk = TRUE;
    while (readLine(f, fileLine, status)) {
        lineCount++;
@ -401,9 +404,15 @@ int  main(int argc, char **argv) {
            }
            dict.addWord(fileLine.tempSubString(0, keyLen), (int32_t)value, status);
            hasValues = TRUE;
+            wordCount++;
+            if (keyLen < minlen) minlen = keyLen;
+            if (keyLen > maxlen) maxlen = keyLen;
        } else {
            dict.addWord(fileLine.tempSubString(0, keyLen), 0, status);
-            hasValuelessContents = FALSE;
+            hasValuelessContents = TRUE;
+            wordCount++;
+            if (keyLen < minlen) minlen = keyLen;
+            if (keyLen > maxlen) maxlen = keyLen;
        }

        if (status.isFailure()) {
@ -412,6 +421,7 @@ int  main(int argc, char **argv) {
            exit(status.reset());
        }
    }
+    if (verbose) { printf("Processed %d lines, added %d words, minlen %d, maxlen %d\n", lineCount, wordCount, minlen, maxlen); }

    if (!isOk && status.isSuccess()) {
        status.set(U_ILLEGAL_ARGUMENT_ERROR);
@ -420,7 +430,7 @@ int  main(int argc, char **argv) {
        fprintf(stderr, "warning: file contained both valued and unvalued strings!\n");
    }

-    if (verbose) { puts("Serializing data..."); }
+    if (verbose) { printf("Serializing data...isBytesTrie? %d\n", isBytesTrie); }
    int32_t outDataSize;
    const void *outData;
    UnicodeString usp;
@ -434,7 +444,7 @@ int  main(int argc, char **argv) {
        outData = usp.getBuffer();
    }
    if (status.isFailure()) {
-        fprintf(stderr, "gendict: got failure of type %s while serializing\n", status.errorName());
+        fprintf(stderr, "gendict: got failure of type %s while serializing, if U_ILLEGAL_ARGUMENT_ERROR possibly due to duplicate dictionary entries\n", status.errorName());
        exit(status.reset());
    }
    if (verbose) { puts("Opening output file..."); }