ICU-10688 remove break type dependency from dictionaries in break iterators.
X-SVN-Rev: 40688
This commit is contained in:
parent
023e8b289f
commit
b64c563688
@ -61,7 +61,6 @@ class BurmeseBreakEngine extends DictionaryBreakEngine {
|
||||
}
|
||||
|
||||
public BurmeseBreakEngine() throws IOException {
|
||||
super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
|
||||
setCharacters(fBurmeseWordSet);
|
||||
// Initialize dictionary
|
||||
fDictionary = DictionaryData.loadDictionaryFor("Mymr");
|
||||
|
@ -38,7 +38,6 @@ class CjkBreakEngine extends DictionaryBreakEngine {
|
||||
private DictionaryMatcher fDictionary = null;
|
||||
|
||||
public CjkBreakEngine(boolean korean) throws IOException {
|
||||
super(BreakIterator.KIND_WORD);
|
||||
fDictionary = DictionaryData.loadDictionaryFor("Hira");
|
||||
if (korean) {
|
||||
setCharacters(fHangulWordSet);
|
||||
|
@ -169,16 +169,11 @@ abstract class DictionaryBreakEngine implements LanguageBreakEngine {
|
||||
}
|
||||
|
||||
UnicodeSet fSet = new UnicodeSet();
|
||||
private BitSet fTypes = new BitSet(32);
|
||||
|
||||
/**
|
||||
* @param breakTypes The types of break iterators that can use this engine.
|
||||
* For example, BreakIterator.KIND_LINE
|
||||
* Constructor
|
||||
*/
|
||||
public DictionaryBreakEngine(Integer... breakTypes) {
|
||||
for (Integer type: breakTypes) {
|
||||
fTypes.set(type);
|
||||
}
|
||||
public DictionaryBreakEngine() {
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -16,7 +16,7 @@ import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
|
||||
class KhmerBreakEngine extends DictionaryBreakEngine {
|
||||
|
||||
|
||||
// Constants for KhmerBreakIterator
|
||||
// How many words in a row are "good enough"?
|
||||
private static final byte KHMER_LOOKAHEAD = 3;
|
||||
@ -29,14 +29,14 @@ class KhmerBreakEngine extends DictionaryBreakEngine {
|
||||
private static final byte KHMER_MIN_WORD = 2;
|
||||
// Minimum number of characters for two words
|
||||
private static final byte KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
|
||||
|
||||
|
||||
|
||||
|
||||
private DictionaryMatcher fDictionary;
|
||||
private static UnicodeSet fKhmerWordSet;
|
||||
private static UnicodeSet fEndWordSet;
|
||||
private static UnicodeSet fBeginWordSet;
|
||||
private static UnicodeSet fMarkSet;
|
||||
|
||||
|
||||
static {
|
||||
// Initialize UnicodeSets
|
||||
fKhmerWordSet = new UnicodeSet();
|
||||
@ -56,42 +56,42 @@ class KhmerBreakEngine extends DictionaryBreakEngine {
|
||||
fMarkSet.compact();
|
||||
fEndWordSet.compact();
|
||||
fBeginWordSet.compact();
|
||||
|
||||
|
||||
// Freeze the static UnicodeSet
|
||||
fKhmerWordSet.freeze();
|
||||
fMarkSet.freeze();
|
||||
fEndWordSet.freeze();
|
||||
fBeginWordSet.freeze();
|
||||
}
|
||||
|
||||
|
||||
public KhmerBreakEngine() throws IOException {
|
||||
super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
|
||||
setCharacters(fKhmerWordSet);
|
||||
// Initialize dictionary
|
||||
fDictionary = DictionaryData.loadDictionaryFor("Khmr");
|
||||
}
|
||||
|
||||
/**
 * Type-based equality: returns true when {@code obj} is a KhmerBreakEngine
 * and false otherwise (a null argument therefore yields false, since
 * {@code instanceof} is false for null). No instance state is compared.
 *
 * @param obj the object to compare against
 * @return true if {@code obj} is an instance of KhmerBreakEngine
 */
@Override
|
||||
public boolean equals(Object obj) {
|
||||
// Normally is a singleton, but it's possible to have duplicates
|
||||
// during initialization. All are equivalent.
|
||||
return obj instanceof KhmerBreakEngine;
|
||||
}
|
||||
|
||||
/**
 * Returns the hash code of this object's runtime class, so the value does
 * not depend on any instance state. This matches equals(), which treats
 * every KhmerBreakEngine as equivalent.
 *
 * @return {@code getClass().hashCode()}
 */
@Override
|
||||
public int hashCode() {
|
||||
return getClass().hashCode();
|
||||
}
|
||||
|
||||
public boolean handles(int c, int breakType) {
|
||||
if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
|
||||
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
|
||||
return (script == UScript.KHMER);
|
||||
}
|
||||
return false;
|
||||
|
||||
/**
 * Reports whether this engine handles the given code point. The decision
 * is based only on the code point's SCRIPT property.
 *
 * @param c the code point to test
 * @return true if the SCRIPT property value of {@code c} is UScript.KHMER
 */
@Override
|
||||
public boolean handles(int c) {
|
||||
// Look up the script property value for the code point.
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
|
||||
return (script == UScript.KHMER);
|
||||
}
|
||||
|
||||
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
|
||||
@Override
|
||||
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
|
||||
DequeI foundBreaks) {
|
||||
|
||||
|
||||
if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
|
||||
return 0; // Not enough characters for word
|
||||
}
|
||||
@ -163,7 +163,7 @@ class KhmerBreakEngine extends DictionaryBreakEngine {
|
||||
// no preceding word, or the non-word shares less than the minimum threshold
|
||||
// of characters with a dictionary word, then scan to resynchronize
|
||||
if (words[wordsFound%KHMER_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
|
||||
(wordLength == 0 ||
|
||||
(wordLength == 0 ||
|
||||
words[wordsFound%KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
|
||||
// Look for a plausible word boundary
|
||||
int remaining = rangeEnd - (current + wordLength);
|
||||
@ -209,7 +209,7 @@ class KhmerBreakEngine extends DictionaryBreakEngine {
|
||||
|
||||
// Look ahead for possible suffixes if a dictionary word does not follow.
|
||||
// We do this in code rather than using a rule so that the heuristic
|
||||
// resynch continues to function. For example, one of the suffix characters
|
||||
// resynch continues to function. For example, one of the suffix characters
|
||||
// could be a typo in the middle of a word.
|
||||
// NOT CURRENTLY APPLICABLE TO KHMER
|
||||
|
||||
|
@ -64,7 +64,6 @@ class LaoBreakEngine extends DictionaryBreakEngine {
|
||||
}
|
||||
|
||||
public LaoBreakEngine() throws IOException {
|
||||
super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
|
||||
setCharacters(fLaoWordSet);
|
||||
// Initialize dictionary
|
||||
fDictionary = DictionaryData.loadDictionaryFor("Laoo");
|
||||
|
@ -16,7 +16,7 @@ import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
|
||||
class ThaiBreakEngine extends DictionaryBreakEngine {
|
||||
|
||||
|
||||
// Constants for ThaiBreakIterator
|
||||
// How many words in a row are "good enough"?
|
||||
private static final byte THAI_LOOKAHEAD = 3;
|
||||
@ -33,14 +33,14 @@ class ThaiBreakEngine extends DictionaryBreakEngine {
|
||||
private static final byte THAI_MIN_WORD = 2;
|
||||
// Minimum number of characters for two words
|
||||
private static final byte THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
|
||||
|
||||
|
||||
private DictionaryMatcher fDictionary;
|
||||
private static UnicodeSet fThaiWordSet;
|
||||
private static UnicodeSet fEndWordSet;
|
||||
private static UnicodeSet fBeginWordSet;
|
||||
private static UnicodeSet fSuffixSet;
|
||||
private static UnicodeSet fMarkSet;
|
||||
|
||||
|
||||
static {
|
||||
// Initialize UnicodeSets
|
||||
fThaiWordSet = new UnicodeSet();
|
||||
@ -66,7 +66,7 @@ class ThaiBreakEngine extends DictionaryBreakEngine {
|
||||
fEndWordSet.compact();
|
||||
fBeginWordSet.compact();
|
||||
fSuffixSet.compact();
|
||||
|
||||
|
||||
// Freeze the static UnicodeSet
|
||||
fThaiWordSet.freeze();
|
||||
fMarkSet.freeze();
|
||||
@ -74,32 +74,32 @@ class ThaiBreakEngine extends DictionaryBreakEngine {
|
||||
fBeginWordSet.freeze();
|
||||
fSuffixSet.freeze();
|
||||
}
|
||||
|
||||
|
||||
public ThaiBreakEngine() throws IOException {
|
||||
super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
|
||||
setCharacters(fThaiWordSet);
|
||||
// Initialize dictionary
|
||||
fDictionary = DictionaryData.loadDictionaryFor("Thai");
|
||||
}
|
||||
|
||||
|
||||
/**
 * Type-based equality: returns true when {@code obj} is a ThaiBreakEngine
 * and false otherwise (a null argument therefore yields false, since
 * {@code instanceof} is false for null). No instance state is compared.
 *
 * @param obj the object to compare against
 * @return true if {@code obj} is an instance of ThaiBreakEngine
 */
@Override
|
||||
public boolean equals(Object obj) {
|
||||
// Normally is a singleton, but it's possible to have duplicates
|
||||
// during initialization. All are equivalent.
|
||||
return obj instanceof ThaiBreakEngine;
|
||||
}
|
||||
|
||||
/**
 * Returns the hash code of this object's runtime class, so the value does
 * not depend on any instance state. This matches equals(), which treats
 * every ThaiBreakEngine as equivalent.
 *
 * @return {@code getClass().hashCode()}
 */
@Override
|
||||
public int hashCode() {
|
||||
return getClass().hashCode();
|
||||
}
|
||||
|
||||
public boolean handles(int c, int breakType) {
|
||||
if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
|
||||
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
|
||||
return (script == UScript.THAI);
|
||||
}
|
||||
return false;
|
||||
|
||||
/**
 * Reports whether this engine handles the given code point. The decision
 * is based only on the code point's SCRIPT property.
 *
 * @param c the code point to test
 * @return true if the SCRIPT property value of {@code c} is UScript.THAI
 */
@Override
|
||||
public boolean handles(int c) {
|
||||
// Look up the script property value for the code point.
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
|
||||
return (script == UScript.THAI);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
|
||||
DequeI foundBreaks) {
|
||||
|
||||
@ -112,7 +112,7 @@ class ThaiBreakEngine extends DictionaryBreakEngine {
|
||||
for (int i = 0; i < THAI_LOOKAHEAD; i++) {
|
||||
words[i] = new PossibleWord();
|
||||
}
|
||||
|
||||
|
||||
int uc;
|
||||
fIter.setIndex(rangeStart);
|
||||
int current;
|
||||
@ -156,7 +156,7 @@ class ThaiBreakEngine extends DictionaryBreakEngine {
|
||||
}
|
||||
} while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(fIter));
|
||||
}
|
||||
}
|
||||
}
|
||||
while (words[wordsFound%THAI_LOOKAHEAD].backUp(fIter));
|
||||
// foundBest: end of loop
|
||||
}
|
||||
@ -174,7 +174,7 @@ class ThaiBreakEngine extends DictionaryBreakEngine {
|
||||
// no preceding word, or the non-word shares less than the minimum threshold
|
||||
// of characters with a dictionary word, then scan to resynchronize
|
||||
if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
|
||||
(wordLength == 0 ||
|
||||
(wordLength == 0 ||
|
||||
words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
|
||||
// Look for a plausible word boundary
|
||||
int remaining = rangeEnd - (current + wordLength);
|
||||
@ -224,7 +224,7 @@ class ThaiBreakEngine extends DictionaryBreakEngine {
|
||||
|
||||
// Look ahead for possible suffixes if a dictionary word does not follow.
|
||||
// We do this in code rather than using a rule so that the heuristic
|
||||
// resynch continues to function. For example, one of the suffix characters
|
||||
// resynch continues to function. For example, one of the suffix characters
|
||||
// could be a typo in the middle of a word.
|
||||
if (fIter.getIndex() < rangeEnd && wordLength > 0) {
|
||||
if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
|
||||
|
Loading…
Reference in New Issue
Block a user