ICU-10688 remove break type dependency from dictionaries in break iterators.

X-SVN-Rev: 40688
This commit is contained in:
Andy Heninger 2017-12-04 19:27:48 +00:00
parent 023e8b289f
commit b64c563688
6 changed files with 38 additions and 46 deletions

View File

@ -61,7 +61,6 @@ class BurmeseBreakEngine extends DictionaryBreakEngine {
}
public BurmeseBreakEngine() throws IOException {
super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
setCharacters(fBurmeseWordSet);
// Initialize dictionary
fDictionary = DictionaryData.loadDictionaryFor("Mymr");

View File

@ -38,7 +38,6 @@ class CjkBreakEngine extends DictionaryBreakEngine {
private DictionaryMatcher fDictionary = null;
public CjkBreakEngine(boolean korean) throws IOException {
super(BreakIterator.KIND_WORD);
fDictionary = DictionaryData.loadDictionaryFor("Hira");
if (korean) {
setCharacters(fHangulWordSet);

View File

@ -169,16 +169,11 @@ abstract class DictionaryBreakEngine implements LanguageBreakEngine {
}
UnicodeSet fSet = new UnicodeSet();
private BitSet fTypes = new BitSet(32);
/**
* @param breakTypes The types of break iterators that can use this engine.
* For example, BreakIterator.KIND_LINE
* Constructor
*/
// NOTE(review): two constructor headers appear back to back here and share one closing
// brace, which looks like the before/after lines of a change shown without +/- markers.
// Confirm against the real file which of the two forms is current.
// Varargs form: sets one bit in fTypes for every break type passed in.
public DictionaryBreakEngine(Integer... breakTypes) {
for (Integer type: breakTypes) {
fTypes.set(type);
}
// No-argument form: empty body, records no break types.
public DictionaryBreakEngine() {
}
@Override

View File

@ -16,7 +16,7 @@ import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
class KhmerBreakEngine extends DictionaryBreakEngine {
// Constants for KhmerBreakIterator
// How many words in a row are "good enough"?
private static final byte KHMER_LOOKAHEAD = 3;
@ -29,14 +29,14 @@ class KhmerBreakEngine extends DictionaryBreakEngine {
private static final byte KHMER_MIN_WORD = 2;
// Minimum number of characters for two words
private static final byte KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
private DictionaryMatcher fDictionary;
private static UnicodeSet fKhmerWordSet;
private static UnicodeSet fEndWordSet;
private static UnicodeSet fBeginWordSet;
private static UnicodeSet fMarkSet;
static {
// Initialize UnicodeSets
fKhmerWordSet = new UnicodeSet();
@ -56,42 +56,42 @@ class KhmerBreakEngine extends DictionaryBreakEngine {
fMarkSet.compact();
fEndWordSet.compact();
fBeginWordSet.compact();
// Freeze the static UnicodeSet
fKhmerWordSet.freeze();
fMarkSet.freeze();
fEndWordSet.freeze();
fBeginWordSet.freeze();
}
/**
 * Builds the Khmer engine: registers fKhmerWordSet through setCharacters and
 * stores the dictionary loaded for the key "Khmr" in fDictionary.
 *
 * @throws IOException declared by this constructor; presumably raised when the
 *         dictionary data cannot be loaded (confirm against DictionaryData)
 */
public KhmerBreakEngine() throws IOException {
// NOTE(review): passes the word and line break kinds to the superclass constructor.
// DictionaryBreakEngine is also shown with a no-argument constructor in this change,
// so confirm whether this explicit break-type list is still expected.
super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
setCharacters(fKhmerWordSet);
// Initialize dictionary
fDictionary = DictionaryData.loadDictionaryFor("Khmr");
}
@Override
public boolean equals(Object obj) {
    // The engine is normally a singleton, yet duplicates may exist while
    // initialization is in progress; every such instance is equivalent, so
    // the comparison is purely by type.
    final boolean isKhmerEngine = (obj instanceof KhmerBreakEngine);
    return isKhmerEngine;
}
@Override
public int hashCode() {
    // Hash on the runtime class only; no per-instance state takes part.
    final Class<?> engineClass = getClass();
    return engineClass.hashCode();
}
// NOTE(review): two handles() headers share the single closing brace below, which looks
// like the before/after lines of a change shown without +/- markers. Confirm against the
// real file which form is current.
// Two-argument form: for the word and line break kinds, true when the script property
// of code point c is Khmer; false for any other break type.
public boolean handles(int c, int breakType) {
if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
return (script == UScript.KHMER);
}
return false;
// One-argument form: true when the script property of code point c is Khmer; the
// break type plays no part in the decision.
@Override
public boolean handles(int c) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
return (script == UScript.KHMER);
}
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
@Override
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
DequeI foundBreaks) {
if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
return 0; // Not enough characters for word
}
@ -163,7 +163,7 @@ class KhmerBreakEngine extends DictionaryBreakEngine {
// no preceding word, or the non-word shares less than the minimum threshold
// of characters with a dictionary word, then scan to resynchronize
if (words[wordsFound%KHMER_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
(wordLength == 0 ||
(wordLength == 0 ||
words[wordsFound%KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
// Look for a plausible word boundary
int remaining = rangeEnd - (current + wordLength);
@ -209,7 +209,7 @@ class KhmerBreakEngine extends DictionaryBreakEngine {
// Look ahead for possible suffixes if a dictionary word does not follow.
// We do this in code rather than using a rule so that the heuristic
// resynch continues to function. For example, one of the suffix characters
// resynch continues to function. For example, one of the suffix characters
// could be a typo in the middle of a word.
// NOT CURRENTLY APPLICABLE TO KHMER

View File

@ -64,7 +64,6 @@ class LaoBreakEngine extends DictionaryBreakEngine {
}
public LaoBreakEngine() throws IOException {
super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
setCharacters(fLaoWordSet);
// Initialize dictionary
fDictionary = DictionaryData.loadDictionaryFor("Laoo");

View File

@ -16,7 +16,7 @@ import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
class ThaiBreakEngine extends DictionaryBreakEngine {
// Constants for ThaiBreakIterator
// How many words in a row are "good enough"?
private static final byte THAI_LOOKAHEAD = 3;
@ -33,14 +33,14 @@ class ThaiBreakEngine extends DictionaryBreakEngine {
private static final byte THAI_MIN_WORD = 2;
// Minimum number of characters for two words
private static final byte THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
private DictionaryMatcher fDictionary;
private static UnicodeSet fThaiWordSet;
private static UnicodeSet fEndWordSet;
private static UnicodeSet fBeginWordSet;
private static UnicodeSet fSuffixSet;
private static UnicodeSet fMarkSet;
static {
// Initialize UnicodeSets
fThaiWordSet = new UnicodeSet();
@ -66,7 +66,7 @@ class ThaiBreakEngine extends DictionaryBreakEngine {
fEndWordSet.compact();
fBeginWordSet.compact();
fSuffixSet.compact();
// Freeze the static UnicodeSet
fThaiWordSet.freeze();
fMarkSet.freeze();
@ -74,32 +74,32 @@ class ThaiBreakEngine extends DictionaryBreakEngine {
fBeginWordSet.freeze();
fSuffixSet.freeze();
}
/**
 * Builds the Thai engine: registers fThaiWordSet through setCharacters and
 * stores the dictionary loaded for the key "Thai" in fDictionary.
 *
 * @throws IOException declared by this constructor; presumably raised when the
 *         dictionary data cannot be loaded (confirm against DictionaryData)
 */
public ThaiBreakEngine() throws IOException {
// NOTE(review): passes the word and line break kinds to the superclass constructor.
// DictionaryBreakEngine is also shown with a no-argument constructor in this change,
// so confirm whether this explicit break-type list is still expected.
super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
setCharacters(fThaiWordSet);
// Initialize dictionary
fDictionary = DictionaryData.loadDictionaryFor("Thai");
}
@Override
public boolean equals(Object obj) {
    // The engine is normally a singleton, yet duplicates may exist while
    // initialization is in progress; every such instance is equivalent, so
    // the comparison is purely by type.
    final boolean isThaiEngine = (obj instanceof ThaiBreakEngine);
    return isThaiEngine;
}
@Override
public int hashCode() {
    // Hash on the runtime class only; no per-instance state takes part.
    final Class<?> engineClass = getClass();
    return engineClass.hashCode();
}
// NOTE(review): two handles() headers share the single closing brace below, which looks
// like the before/after lines of a change shown without +/- markers. Confirm against the
// real file which form is current.
// Two-argument form: for the word and line break kinds, true when the script property
// of code point c is Thai; false for any other break type.
public boolean handles(int c, int breakType) {
if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
return (script == UScript.THAI);
}
return false;
// One-argument form: true when the script property of code point c is Thai; the
// break type plays no part in the decision.
@Override
public boolean handles(int c) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
return (script == UScript.THAI);
}
@Override
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
DequeI foundBreaks) {
@ -112,7 +112,7 @@ class ThaiBreakEngine extends DictionaryBreakEngine {
for (int i = 0; i < THAI_LOOKAHEAD; i++) {
words[i] = new PossibleWord();
}
int uc;
fIter.setIndex(rangeStart);
int current;
@ -156,7 +156,7 @@ class ThaiBreakEngine extends DictionaryBreakEngine {
}
} while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(fIter));
}
}
}
while (words[wordsFound%THAI_LOOKAHEAD].backUp(fIter));
// foundBest: end of loop
}
@ -174,7 +174,7 @@ class ThaiBreakEngine extends DictionaryBreakEngine {
// no preceding word, or the non-word shares less than the minimum threshold
// of characters with a dictionary word, then scan to resynchronize
if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
(wordLength == 0 ||
(wordLength == 0 ||
words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
// Look for a plausible word boundary
int remaining = rangeEnd - (current + wordLength);
@ -224,7 +224,7 @@ class ThaiBreakEngine extends DictionaryBreakEngine {
// Look ahead for possible suffixes if a dictionary word does not follow.
// We do this in code rather than using a rule so that the heuristic
// resynch continues to function. For example, one of the suffix characters
// resynch continues to function. For example, one of the suffix characters
// could be a typo in the middle of a word.
if (fIter.getIndex() < rangeEnd && wordLength > 0) {
if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&