ICU-9353 merge dbbi-tries work into the trunk

X-SVN-Rev: 32185
This commit is contained in:
Maxime Serrano 2012-08-16 23:16:04 +00:00
parent c64c0299d7
commit ed2c14b425
26 changed files with 1372 additions and 1271 deletions

View File

@ -1583,7 +1583,7 @@
<include name="**/pnames.icu"/>
<include name="**/*.res"/>
<include name="**/*.brk"/>
<include name="**/*.ctd"/>
<include name="**/*.dict"/>
<include name="**/*.nrm"/>
<exclude name="**/coll/*.res"/>
<exclude name="**/translit/*.res"/>
@ -1676,7 +1676,7 @@
<include name="**/unames.icu"/>
<include name="**/pnames.icu"/>
<include name="**/*.brk"/>
<include name="**/*.ctd"/>
<include name="**/*.dict"/>
<include name="**/*.nrm"/>
<include name="**/brkitr/*.res"/>
<include name="**/translit/*.res"/>

View File

@ -0,0 +1,126 @@
/*
*******************************************************************************
* Copyright (C) 2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.text.CharacterIterator;
import com.ibm.icu.text.UTF16;
public final class CharacterIteration {
    // disallow instantiation
    private CharacterIteration() { }

    /**
     * 32-bit value returned when an iterator has run out of range.
     * It is positive so that the fast case (not end, not surrogate) can be
     * checked with a single test.
     */
    public static final int DONE32 = 0x7fffffff;

    /**
     * Move the iterator forward to the next code point, and return that code point,
     * leaving the iterator positioned at the char returned.
     * For supplementary chars, the iterator is left positioned at the lead surrogate.
     * @param ci  The character iterator
     * @return    The next code point, or {@link #DONE32} when the underlying
     *            iterator reports DONE at or past its end index.
     */
    public static int next32(CharacterIterator ci) {
        // If the current position is at a surrogate pair, move to the trail surrogate,
        // which leaves it in position for the underlying iterator's next() to work.
        int c = ci.current();
        if (c >= Character.MIN_HIGH_SURROGATE && c <= Character.MAX_HIGH_SURROGATE) {
            c = ci.next();
            if (c < Character.MIN_LOW_SURROGATE || c > Character.MAX_LOW_SURROGATE) {
                // Unpaired lead surrogate: undo the speculative step.
                ci.previous();
            }
        }

        // For BMP chars, this next() is the real deal.
        c = ci.next();

        // If we might have a lead surrogate, we need to peek ahead to get the trail
        // even though we don't want to really be positioned there.
        if (c >= Character.MIN_HIGH_SURROGATE) {
            c = nextTrail32(ci, c);
        }

        if (c >= Character.MIN_SUPPLEMENTARY_CODE_POINT && c != DONE32) {
            // We got a supplementary char. Back the iterator up to the position
            // of the lead surrogate.
            ci.previous();
        }
        return c;
    }

    /**
     * Out-of-line portion of the in-line next32 code.
     * The call site does an initial ci.next() and calls this function
     * if the 16 bit value it gets is &gt;= the smallest lead surrogate.
     * <p>
     * NOTE: when a pair is combined, the underlying char iterator is left positioned
     * in the middle of the surrogate pair. ci.next() will work correctly
     * from there, but ci.getIndex() will be wrong, and needs adjustment.
     *
     * @param ci    The character iterator, already advanced onto {@code lead}
     * @param lead  The 16 bit value most recently returned by ci.next()
     * @return      The combined supplementary code point if {@code lead} is a lead
     *              surrogate followed by a trail surrogate; {@link #DONE32} if
     *              {@code lead} is CharacterIterator.DONE and the iterator is at or
     *              past its end index; otherwise {@code lead} unchanged.
     */
    public static int nextTrail32(CharacterIterator ci, int lead) {
        if (lead <= Character.MAX_HIGH_SURROGATE) {
            char trail = ci.next();
            if (Character.isLowSurrogate(trail)) {
                return toCodePoint(lead, trail);
            }
            // Not a pair after all; step back onto the lone lead surrogate.
            ci.previous();
            return lead;
        }
        // 0xFFFF is both DONE and a legal char value, so only report the end
        // when the index confirms it.
        if (lead == CharacterIterator.DONE && ci.getIndex() >= ci.getEndIndex()) {
            return DONE32;
        }
        return lead;
    }

    /**
     * Move the iterator back by one code point and return that code point.
     * For supplementary chars, the iterator is left positioned at the lead surrogate.
     * @param ci  The character iterator
     * @return    The previous code point, or {@link #DONE32} if the iterator is
     *            already at (or before) its begin index.
     */
    public static int previous32(CharacterIterator ci) {
        if (ci.getIndex() <= ci.getBeginIndex()) {
            return DONE32;
        }
        char trail = ci.previous();
        if (Character.isLowSurrogate(trail) && ci.getIndex() > ci.getBeginIndex()) {
            char lead = ci.previous();
            if (Character.isHighSurrogate(lead)) {
                return toCodePoint(lead, trail);
            }
            // Lone trail surrogate: return to it.
            ci.next();
        }
        return trail;
    }

    /**
     * Return the code point at the iterator's current position. The iterator is
     * stepped forward and back to look at a possible trail surrogate, and ends
     * up at the index where it started.
     * @param ci  The character iterator
     * @return    The code point at the current position, or {@link #DONE32} if
     *            current() reports DONE and the index is at or past the end index.
     */
    public static int current32(CharacterIterator ci) {
        char lead = ci.current();
        if (lead < Character.MIN_HIGH_SURROGATE) {
            // Fast path: plain BMP char below the surrogate range.
            return lead;
        }
        if (Character.isHighSurrogate(lead)) {
            char trail = ci.next();
            ci.previous();
            if (Character.isLowSurrogate(trail)) {
                return toCodePoint(lead, trail);
            }
            return lead;
        }
        if (lead == CharacterIterator.DONE && ci.getIndex() >= ci.getEndIndex()) {
            return DONE32;
        }
        return lead;
    }

    // Combine a lead and trail surrogate value into a supplementary code point.
    private static int toCodePoint(int lead, int trail) {
        return ((lead - Character.MIN_HIGH_SURROGATE) << 10)
                + (trail - Character.MIN_LOW_SURROGATE)
                + Character.MIN_SUPPLEMENTARY_CODE_POINT;
    }
}

View File

@ -732,6 +732,11 @@ s */
BreakIteratorCache cache = new BreakIteratorCache(where, result);
iterCache[kind] = new SoftReference<BreakIteratorCache>(cache);
if (result instanceof RuleBasedBreakIterator) {
RuleBasedBreakIterator rbbi = (RuleBasedBreakIterator)result;
rbbi.setBreakType(kind);
}
return result;
}

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2002-2010, International Business Machines Corporation and *
* Copyright (C) 2002-2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@ -90,28 +90,20 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
* pre-compiled break rules. The resource bundle name is "boundaries".
* The value for each key will be the rules to be used for the
* specified locale - "word" -> "word_th" for Thai, for example.
* DICTIONARY_POSSIBLE indexes in the same way, and indicates whether a
* dictionary is a possibility for that type of break. This is just
* an optimization to avoid a resource lookup where no dictionary is
* ever possible.
*/
private static final String[] KIND_NAMES = {
"grapheme", "word", "line", "sentence", "title"
};
private static final boolean[] DICTIONARY_POSSIBLE = {
false, true, true, false, false
};
private static BreakIterator createBreakInstance(ULocale locale, int kind) {
BreakIterator iter = null;
ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BRKITR_BASE_NAME, locale);
RuleBasedBreakIterator iter = null;
ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BRKITR_BASE_NAME, locale);
//
// Get the binary rules. These are needed for both normal RulesBasedBreakIterators
// and for Dictionary iterators.
//
// Get the binary rules.
//
InputStream ruleStream = null;
try {
String typeKey = KIND_NAMES[kind];
@ -122,51 +114,22 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
catch (Exception e) {
throw new MissingResourceException(e.toString(),"","");
}
//
// Check whether a dictionary exists, and create a DBBI iterator if
// one does.
//
if (DICTIONARY_POSSIBLE[kind]) {
// This type of break iterator could potentially use a dictionary.
//
try {
if (locale.getLanguage().equals("th")){
// If the language is Thai, load the thai compact trie dictionary.
String dictType = "Thai";
String dictFileName = rb.getStringWithFallback("dictionaries/" + dictType);
dictFileName = ICUResourceBundle.ICU_BUNDLE +ICUResourceBundle.ICU_BRKITR_NAME+ "/" + dictFileName;
InputStream is = ICUData.getStream(dictFileName);
iter = new ThaiBreakIterator(ruleStream, is);
}
} catch (MissingResourceException e) {
// Couldn't find a dictionary.
// This is normal, and will occur whenever creating a word or line
// break iterator for a locale that does not have a BreakDictionaryData
// resource - meaning for all but Thai.
// Fall through to creating a normal RulebasedBreakIterator.
} catch (IOException e) {
Assert.fail(e);
}
}
if (iter == null) {
//
// Create a normal RuleBasedBreakIterator.
// We have determined that this is not supposed to be a dictionary iterator.
//
try {
iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(ruleStream);
}
catch (IOException e) {
// Shouldn't be possible to get here.
// If it happens, the compiled rules are probably corrupted in some way.
Assert.fail(e);
}
//
// Create a normal RuleBasedBreakIterator.
//
try {
iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(ruleStream);
}
catch (IOException e) {
// Shouldn't be possible to get here.
// If it happens, the compiled rules are probably corrupted in some way.
Assert.fail(e);
}
// TODO: Determine valid and actual locale correctly.
ULocale uloc = ULocale.forLocale(rb.getLocale());
iter.setLocale(uloc, uloc);
iter.setBreakType(kind);
return iter;

View File

@ -0,0 +1,83 @@
/*
*******************************************************************************
* Copyright (C) 2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.text.CharacterIterator;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.util.BytesTrie;
import com.ibm.icu.util.BytesTrie.Result;
/**
 * Dictionary matcher backed by a serialized BytesTrie. Each input code point
 * is mapped to a single byte value by {@link #transform(int)} before it is
 * fed to the trie.
 */
class BytesDictionaryMatcher extends DictionaryMatcher {
    private final byte[] characters;
    private final int transform;

    /**
     * @param chars      the serialized trie bytes
     * @param transform  the transform constant; its type bits must equal
     *                   DictionaryData.TRANSFORM_TYPE_OFFSET
     */
    public BytesDictionaryMatcher(byte[] chars, int transform) {
        characters = chars;
        Assert.assrt((transform & DictionaryData.TRANSFORM_TYPE_MASK) == DictionaryData.TRANSFORM_TYPE_OFFSET);
        // while there is only one transform type so far, save the entire transform constant so that
        // if we add any others, we need only change code in transform() and the assert above rather
        // than adding a "transform type" variable
        this.transform = transform;
    }

    /**
     * Map a code point to the value handed to the trie: U+200D maps to 0xFF,
     * U+200C maps to 0xFE, and any other code point maps to its distance from
     * the transform offset when that distance is in 0..0xFD, else -1.
     */
    private int transform(int c) {
        if (c == 0x200D) {
            return 0xFF;
        } else if (c == 0x200C) {
            return 0xFE;
        }

        int delta = c - (transform & DictionaryData.TRANSFORM_OFFSET_MASK);
        if (delta < 0 || 0xFD < delta) {
            // NOTE(review): confirm how BytesTrie treats a negative input value.
            return -1;
        }
        return delta;
    }

    /**
     * Walk the trie along the text starting at the iterator's current position.
     *
     * @param text_      text to match; wrapped in a UCharacterIterator
     * @param maxLength  maximum number of code points to consume
     * @param lengths    output: length (in code points consumed) of each recorded match
     * @param count_     output: element 0 receives the number of recorded matches
     * @param limit      maximum number of matches to record
     * @param values     output: trie value of each recorded match; may be null
     * @return the number of code points consumed; 0 if the text was already exhausted
     */
    public int matches(CharacterIterator text_, int maxLength, int[] lengths, int[] count_, int limit, int[] values) {
        UCharacterIterator text = UCharacterIterator.getInstance(text_);
        BytesTrie bt = new BytesTrie(characters, 0);
        int c = text.nextCodePoint();
        if (c == UCharacterIterator.DONE) {
            // Nothing to match: do not feed the end-of-text sentinel to the trie.
            count_[0] = 0;
            return 0;
        }
        Result result = bt.first(transform(c));
        // TODO: should numChars count Character.charCount() ?
        int numChars = 1;
        int count = 0;
        for (;;) {
            if (result.hasValue()) {
                if (count < limit) {
                    if (values != null) {
                        values[count] = bt.getValue();
                    }
                    lengths[count] = numChars;
                    count++;
                }
                if (result == Result.FINAL_VALUE) {
                    break;
                }
            } else if (result == Result.NO_MATCH) {
                break;
            }

            if (numChars >= maxLength) {
                break;
            }

            c = text.nextCodePoint();
            if (c == UCharacterIterator.DONE) {
                // Ran off the end of the text; the sentinel is not a character.
                break;
            }
            ++numChars;
            result = bt.next(transform(c));
        }
        count_[0] = count;
        return numChars;
    }

    /** @return DictionaryData.TRIE_TYPE_BYTES */
    public int getType() {
        return DictionaryData.TRIE_TYPE_BYTES;
    }
}

View File

@ -0,0 +1,61 @@
/*
*******************************************************************************
* Copyright (C) 2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.text.CharacterIterator;
import com.ibm.icu.util.BytesTrie.Result;
import com.ibm.icu.util.CharsTrie;
/**
 * Dictionary matcher backed by a serialized CharsTrie; input code points are
 * fed to the trie unchanged.
 */
class CharsDictionaryMatcher extends DictionaryMatcher {
    private final CharSequence characters;

    /**
     * @param chars  the serialized trie data
     */
    public CharsDictionaryMatcher(CharSequence chars) {
        characters = chars;
    }

    /**
     * Walk the trie along the text starting at the iterator's current position.
     *
     * @param text_      text to match; wrapped in a UCharacterIterator
     * @param maxLength  maximum number of code points to consume
     * @param lengths    output: length (in code points consumed) of each recorded match
     * @param count_     output: element 0 receives the number of recorded matches
     * @param limit      maximum number of matches to record
     * @param values     output: trie value of each recorded match; may be null
     * @return the number of code points consumed; 0 if the text was already exhausted
     */
    public int matches(CharacterIterator text_, int maxLength, int[] lengths, int[] count_, int limit, int[] values) {
        UCharacterIterator text = UCharacterIterator.getInstance(text_);
        CharsTrie uct = new CharsTrie(characters, 0);
        int c = text.nextCodePoint();
        if (c == UCharacterIterator.DONE) {
            // Nothing to match: do not feed the end-of-text sentinel to the trie.
            count_[0] = 0;
            return 0;
        }
        Result result = uct.firstForCodePoint(c);
        // TODO: should numChars count Character.charCount?
        int numChars = 1;
        int count = 0;
        for (;;) {
            if (result.hasValue()) {
                if (count < limit) {
                    if (values != null) {
                        values[count] = uct.getValue();
                    }
                    lengths[count] = numChars;
                    count++;
                }

                if (result == Result.FINAL_VALUE) {
                    break;
                }
            } else if (result == Result.NO_MATCH) {
                break;
            }

            if (numChars >= maxLength) {
                break;
            }
            c = text.nextCodePoint();
            if (c == UCharacterIterator.DONE) {
                // Ran off the end of the text; the sentinel is not a character.
                break;
            }
            ++numChars;
            result = uct.nextForCodePoint(c);
        }
        count_[0] = count;
        return numChars;
    }

    /** @return DictionaryData.TRIE_TYPE_UCHARS */
    public int getType() {
        return DictionaryData.TRIE_TYPE_UCHARS;
    }
}

View File

@ -0,0 +1,218 @@
/*
*******************************************************************************
* Copyright (C) 2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.io.IOException;
import java.text.CharacterIterator;
import java.util.Stack;
import com.ibm.icu.impl.Assert;
import static com.ibm.icu.impl.CharacterIteration.*;
/**
 * Language break engine that segments runs of CJK text (or Hangul, when
 * constructed for Korean) into words using a dictionary and a
 * dynamic-programming search for the cheapest segmentation.
 */
public class CjkBreakEngine implements LanguageBreakEngine {
    private static final UnicodeSet fHangulWordSet = new UnicodeSet();
    private static final UnicodeSet fHanWordSet = new UnicodeSet();
    private static final UnicodeSet fKatakanaWordSet = new UnicodeSet();
    private static final UnicodeSet fHiraganaWordSet = new UnicodeSet();
    static {
        fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]");
        fHanWordSet.applyPattern("[:Han:]");
        fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]");
        fHiraganaWordSet.applyPattern("[:Hiragana:]");

        // freeze them all
        fHangulWordSet.freeze();
        fHanWordSet.freeze();
        fKatakanaWordSet.freeze();
        fHiraganaWordSet.freeze();
    }

    private final UnicodeSet fWordSet;
    private final DictionaryMatcher fDictionary;

    /**
     * @param korean  true to handle only Hangul syllables; false to handle
     *                Han, Katakana and Hiragana plus the two prolonged sound marks
     * @throws IOException declared for the dictionary load
     */
    public CjkBreakEngine(boolean korean) throws IOException {
        fDictionary = DictionaryData.loadDictionaryFor("Hira");
        if (korean) {
            fWordSet = fHangulWordSet;
        } else {
            fWordSet = new UnicodeSet();
            fWordSet.addAll(fHanWordSet);
            fWordSet.addAll(fKatakanaWordSet);
            fWordSet.addAll(fHiraganaWordSet);
            // Add the two code points themselves. Passing the escaped text as a
            // String would add one multi-character string of literal backslash
            // sequences instead, which contains(int) never matches.
            fWordSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
            fWordSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
        }
    }

    /**
     * @return true only for word breaking, and only when {@code c} is in this
     *         engine's word set
     */
    public boolean handles(int c, int breakType) {
        return (breakType == BreakIterator.KIND_WORD) &&
                (fWordSet.contains(c));
    }

    private static final int kMaxKatakanaLength = 8;
    private static final int kMaxKatakanaGroupLength = 20;
    private static final int maxSnlp = 255;
    private static final int kint32max = Integer.MAX_VALUE;
    // Cost of a Katakana run, indexed by its length 0..kMaxKatakanaLength.
    private static final int[] katakanaCost = { 8192, 984, 408, 240, 204, 252, 300, 372, 480 };

    private static int getKatakanaCost(int wordlength) {
        return (wordlength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordlength];
    }

    private static boolean isKatakana(int value) {
        return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
                (value >= 0xFF66 && value <= 0xFF9F);
    }

    /**
     * Find word boundaries in {@code inText} between {@code startPos} and
     * {@code endPos} and push the new ones onto {@code foundBreaks}.
     *
     * @param inText       the text being analyzed; on return it is positioned at
     *                     the top of {@code foundBreaks} when that stack is non-empty
     * @param startPos     start of the range (inclusive)
     * @param endPos       end of the range (exclusive)
     * @param reverse      not used by this engine
     * @param breakType    not used by this method
     * @param foundBreaks  stack receiving the boundary positions, expressed as
     *                     indices into {@code inText}
     * @return always 0
     */
    public int findBreaks(CharacterIterator inText, int startPos, int endPos,
            boolean reverse, int breakType, Stack<Integer> foundBreaks) {
        if (startPos >= endPos) {
            return 0;
        }

        // Copy the range out of the iterator so it can be normalized.
        int inputLength = endPos - startPos;
        int[] charPositions = new int[inputLength + 1];
        StringBuilder s = new StringBuilder(inputLength);
        inText.setIndex(startPos);
        while (inText.getIndex() < endPos) {
            s.append(inText.current());
            inText.next();
        }
        String prenormstr = s.toString();
        boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES ||
                               Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0);
        CharacterIterator text;
        int numChars = 0;
        if (isNormalized) {
            // Iterate over the extracted range, not over inText: every index used
            // below is relative to the start of the range (0), not to startPos.
            text = new java.text.StringCharacterIterator(prenormstr);
            int index = 0;
            charPositions[0] = 0;
            while (index < prenormstr.length()) {
                int codepoint = prenormstr.codePointAt(index);
                index += Character.charCount(codepoint);
                numChars++;
                charPositions[numChars] = index;
            }
        } else {
            String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC);
            text = new java.text.StringCharacterIterator(normStr);
            charPositions = new int[normStr.length() + 1];
            Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0);
            int index = 0;
            charPositions[0] = 0;
            while (index < normalizer.endIndex()) {
                normalizer.next();
                numChars++;
                index = normalizer.getIndex();
                charPositions[numChars] = index;
            }
        }

        // From here on out, do the algorithm. Note that our indices
        // refer to indices within the normalized string.
        // bestSnlp[i] is the lowest cost found so far for segmenting the first i characters.
        int[] bestSnlp = new int[numChars + 1];
        bestSnlp[0] = 0;
        for (int i = 1; i <= numChars; i++) {
            bestSnlp[i] = kint32max;
        }

        // prev[i] is the start of the last word in the best segmentation ending at i.
        int[] prev = new int[numChars + 1];
        for (int i = 0; i <= numChars; i++) {
            prev[i] = -1;
        }

        final int maxWordSize = 20;
        int values[] = new int[numChars];
        int lengths[] = new int[numChars];
        // dynamic programming to find the best segmentation
        boolean is_prev_katakana = false;
        for (int i = 0; i < numChars; i++) {
            // NOTE(review): i counts code points but setIndex() takes a UTF-16
            // offset; the two agree only while no supplementary code point
            // precedes position i. Confirm whether that case needs handling.
            text.setIndex(i);
            if (bestSnlp[i] == kint32max) {
                continue;
            }

            int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i);
            int[] count_ = new int[1];
            fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
            int count = count_[0];

            // matches() may have advanced the iterator; re-anchor at i so the
            // checks below look at the character this iteration starts with.
            text.setIndex(i);

            // if there are no single character matches found in the dictionary
            // starting with this character, treat character as a 1-character word
            // with the highest value possible (i.e. the least likely to occur).
            // Exclude Korean characters from this treatment, as they should be
            // left together by default.
            if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) {
                values[count] = maxSnlp;
                lengths[count] = 1;
                count++;
            }

            for (int j = 0; j < count; j++) {
                int newSnlp = bestSnlp[i] + values[j];
                if (newSnlp < bestSnlp[lengths[j] + i]) {
                    bestSnlp[lengths[j] + i] = newSnlp;
                    prev[lengths[j] + i] = i;
                }
            }

            // In Japanese, single-character Katakana words are pretty rare.
            // So we apply the following heuristic to Katakana: any continuous
            // run of Katakana characters is considered a candidate word with
            // a default cost specified in the katakanaCost table according
            // to its length.
            text.setIndex(i);
            boolean is_katakana = isKatakana(current32(text));
            if (!is_prev_katakana && is_katakana) {
                int j = i + 1;
                next32(text);
                while (j < numChars && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) {
                    next32(text);
                    ++j;
                }

                if ((j - i) < kMaxKatakanaGroupLength) {
                    int newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
                    if (newSnlp < bestSnlp[j]) {
                        bestSnlp[j] = newSnlp;
                        prev[j] = i;
                    }
                }
            }
            is_prev_katakana = is_katakana;
        }

        // Walk prev[] back from the end to collect the boundaries, last one first.
        int t_boundary[] = new int[numChars + 1];
        int numBreaks = 0;
        if (bestSnlp[numChars] == kint32max) {
            // No segmentation reached the end: treat the whole range as one word.
            t_boundary[numBreaks] = numChars;
            numBreaks++;
        } else {
            for (int i = numChars; i > 0; i = prev[i]) {
                t_boundary[numBreaks] = i;
                numBreaks++;
            }
            Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0);
        }

        if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) {
            t_boundary[numBreaks++] = 0;
        }

        // Map boundaries back to original-text indices and push them in ascending order.
        for (int i = numBreaks - 1; i >= 0; i--) {
            int pos = charPositions[t_boundary[i]] + startPos;
            if (!(foundBreaks.contains(pos) || pos == startPos)) {
                foundBreaks.push(pos);
            }
        }

        if (!foundBreaks.empty() && foundBreaks.peek() == endPos) {
            foundBreaks.pop();
        }
        if (!foundBreaks.empty()) {
            inText.setIndex(foundBreaks.peek());
        }
        return 0;
    }
}

View File

@ -1,565 +0,0 @@
/*
*******************************************************************************
* Copyright (C) 1996-2010, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.io.IOException;
import java.io.InputStream;
import java.text.CharacterIterator;
import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
import com.ibm.icu.impl.Assert;
/**
* A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
* to further subdivide ranges of text beyond what is possible using just the
* state-table-based algorithm. This is necessary, for example, to handle
* word and line breaking in Thai, which doesn't use spaces between words. The
* state-table-based algorithm used by RuleBasedBreakIterator_Old is used to divide
* up text as far as possible, and then contiguous ranges of letters are
* repeatedly compared against a list of known words (i.e., the dictionary)
* to divide them up into words.
*
* DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator_Old,
* but adds one more special substitution name: _dictionary_. This substitution
* name is used to identify characters in words in the dictionary. The idea is that
* if the iterator passes over a chunk of text that includes two or more characters
* in a row that are included in _dictionary_, it goes back through that range and
* derives additional break positions (if possible) using the dictionary.
*
* DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
* file. It uses Class.getResource() to locate the dictionary file. The
* dictionary file is in a serialized binary format. We have a very primitive (and
* slow) BuildDictionaryFile utility for creating dictionary files, but aren't
* currently making it public. Contact us for help.
*
* @stable ICU 2.0
*/
public class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
/**
* Keeps track of if we are using the compact trie dictionary.
*/
private boolean usingCTDictionary = false;
/**
* a list of known words that is used to divide up contiguous ranges of letters,
* stored in a compressed, indexed, format that offers fast access
*/
private BreakDictionary dictionary;
/*
* a list of flags indicating which character categories are contained in
* the dictionary file (this is used to determine which ranges of characters
* to apply the dictionary to)
*/
//private boolean[] categoryFlags;
/**
* when a range of characters is divided up using the dictionary, the break
* positions that are discovered are stored here, preventing us from having
* to use either the dictionary or the state table again until the iterator
* leaves this range of text
*/
int[] cachedBreakPositions;
/**
* if cachedBreakPositions is not null, this indicates which item in the
* cache the current iteration position refers to
*/
int positionInCache;
/**
* Special variable name for characters in words in dictionary
*/
/**
* Construct a DictionaryBasedBreakIterator from precompiled rules. Used by ThaiBreakEngine;
* uses the BreakCTDictionary.
* @param compiledRules an input stream containing the binary (flattened) compiled rules.
* @internal
* @deprecated This API is ICU internal only.
*/
protected DictionaryBasedBreakIterator(InputStream compiledRules) throws IOException {
    // Load only the rule tables; no BreakDictionary is attached in this mode.
    this.fRData = RBBIDataWrapper.get(compiledRules);
    this.usingCTDictionary = true;
    this.dictionary = null;
}
/**
* Constructs a DictionaryBasedBreakIterator.
* @param rules Same as the rules parameter on RuleBasedBreakIterator,
* except for the special meaning of "_dictionary_". This parameter is just
* passed through to RuleBasedBreakIterator constructor.
* @param dictionaryStream the stream containing the dictionary data
* @stable ICU 2.0
*/
public DictionaryBasedBreakIterator(String rules, InputStream dictionaryStream)
        throws IOException {
    // The rule source goes straight to the superclass constructor.
    super(rules);
    // Read the word list from the supplied stream.
    this.dictionary = new BreakDictionary(dictionaryStream);
}
/**
* Construct a DictionaryBasedBreakIterator from precompiled rules.
* @param compiledRules an input stream containing the binary (flattened) compiled rules.
* @param dictionaryStream an input stream containing the dictionary data
* @internal
* @deprecated This API is ICU internal only.
*/
public DictionaryBasedBreakIterator(InputStream compiledRules, InputStream dictionaryStream)
        throws IOException {
    // Init the RBBI part of this iterator from the flattened rules.
    this.fRData = RBBIDataWrapper.get(compiledRules);
    // Then read the dictionary data.
    this.dictionary = new BreakDictionary(dictionaryStream);
}
/** @stable ICU 2.0 */
public void setText(CharacterIterator newText) {
    super.setText(newText);
    // Cached break positions describe the old text; discard all cache state.
    positionInCache = 0;
    fDictionaryCharCount = 0;
    cachedBreakPositions = null;
}
/**
* Sets the current iteration position to the beginning of the text.
* (i.e., the CharacterIterator's starting offset).
* @return The offset of the beginning of the text.
* @stable ICU 2.0
*/
public int first() {
    // Drop all cache state before delegating to the inherited first().
    positionInCache = 0;
    fDictionaryCharCount = 0;
    cachedBreakPositions = null;
    final int start = super.first();
    return start;
}
/**
* Sets the current iteration position to the end of the text.
* (i.e., the CharacterIterator's ending offset).
* @return The text's past-the-end offset.
* @stable ICU 2.0
*/
public int last() {
    // Drop all cache state before delegating to the inherited last().
    positionInCache = 0;
    fDictionaryCharCount = 0;
    cachedBreakPositions = null;
    final int end = super.last();
    return end;
}
/**
* Advances the iterator one step backwards.
* @return The position of the last boundary position before the
* current iteration position
* @stable ICU 2.0
*/
public int previous() {
// Two paths: step backward inside the cached break positions when possible;
// otherwise discard the cache and recompute via the inherited previous().
CharacterIterator text = getText();
// if we have cached break positions and we're still in the range
// covered by them, just move one step backward in the cache
if (cachedBreakPositions != null && positionInCache > 0) {
--positionInCache;
text.setIndex(cachedBreakPositions[positionInCache]);
return cachedBreakPositions[positionInCache];
}
// otherwise, dump the cache and use the inherited previous() method to move
// backward. This may fill up the cache with new break positions, in which
// case we have to mark our position in the cache. If it doesn't, use next()
// to move forward until we hit or pass the current position. This *will* fill
// the cache.
else {
cachedBreakPositions = null;
// remember where we started so the forward scan below knows when to stop
int offset = current();
int result = super.previous();
if (cachedBreakPositions != null) {
// NOTE(review): presumably the last cache entry is the end of the divided
// range, making length - 2 the boundary just before it -- confirm against
// the routine that fills the cache.
positionInCache = cachedBreakPositions.length - 2;
return result;
}
// advance with next() and keep the last boundary that is still before "offset"
while (result < offset) {
int nextResult = next();
if (nextResult >= offset) {
break;
}
result = nextResult;
}
if (cachedBreakPositions != null) {
positionInCache = cachedBreakPositions.length - 2;
}
// next() moved the text past the boundary being returned; move it back
if (result != BreakIterator.DONE) {
text.setIndex(result);
}
return result;
}
}
/**
* Sets the current iteration position to the last boundary position
* before the specified position.
* @param offset The position to begin searching from
* @return The position of the last boundary before "offset"
* @stable ICU 2.0
*/
public int preceding(int offset) {
    final CharacterIterator iter = getText();
    checkOffset(offset, iter);

    // The cache can answer only when it exists and "offset" lies after its
    // first entry and no later than its last entry.
    final boolean cacheCovers = cachedBreakPositions != null
            && offset > cachedBreakPositions[0]
            && offset <= cachedBreakPositions[cachedBreakPositions.length - 1];
    if (!cacheCovers) {
        // Drop the cache and let the inherited routine do the work (it may
        // call back into this class and refresh the cache).
        cachedBreakPositions = null;
        return super.preceding(offset);
    }

    // Find the first cached position that is not before "offset"; the entry
    // just ahead of it is the last break position before "offset".
    int idx = 0;
    while (idx < cachedBreakPositions.length && offset > cachedBreakPositions[idx]) {
        idx++;
    }
    positionInCache = idx - 1;
    iter.setIndex(cachedBreakPositions[positionInCache]);
    return iter.getIndex();
}
/**
* Sets the current iteration position to the first boundary position after
* the specified position.
* @param offset The position to begin searching forward from
* @return The position of the first boundary after "offset"
* @stable ICU 2.0
*/
public int following(int offset) {
    final CharacterIterator iter = getText();
    checkOffset(offset, iter);

    // The cache can answer only when it exists and "offset" lies at or after
    // its first entry and strictly before its last entry.
    final boolean cacheCovers = cachedBreakPositions != null
            && offset >= cachedBreakPositions[0]
            && offset < cachedBreakPositions[cachedBreakPositions.length - 1];
    if (!cacheCovers) {
        // Drop the cache and call the inherited following(), which may call
        // other methods in this class that refresh the cache.
        cachedBreakPositions = null;
        return super.following(offset);
    }

    // The first cached position beyond "offset" is the answer.
    int idx = 0;
    while (idx < cachedBreakPositions.length && offset >= cachedBreakPositions[idx]) {
        idx++;
    }
    positionInCache = idx;
    iter.setIndex(cachedBreakPositions[positionInCache]);
    return iter.getIndex();
}
/**
* Return the status tag from the break rule that determined the most recently
* returned break position.
*
* TODO: not supported with dictionary based break iterators.
*
* @return the status from the break rule that determined the most recently
* returned break position.
* @draft ICU 3.0
* @provisional This API might change or be removed in a future release.
*/
public int getRuleStatus() {
    // Rule status tags are not supported by this iterator; always report zero.
    final int status = 0;
    return status;
}
/**
* Get the status (tag) values from the break rule(s) that determined the most
* recently returned break position. The values appear in the rule source
* within brackets, {123}, for example. The default status value for rules
* that do not explicitly provide one is zero.
* <p>
* TODO: not supported for dictionary based break iterator.
*
* @param fillInArray an array to be filled in with the status values.
* @return The number of rule status values from rules that determined
* the most recent boundary returned by the break iterator.
* In the event that the array is too small, the return value
* is the total number of status values that were available,
* not the reduced number that were actually returned.
* @draft ICU 3.0
* @provisional This API might change or be removed in a future release.
*/
public int getRuleStatusVec(int[] fillInArray) {
    // Exactly one status value (zero) is ever reported; store it when the
    // caller's array has room for it.
    final boolean hasRoom = fillInArray != null && fillInArray.length >= 1;
    if (hasRoom) {
        fillInArray[0] = 0;
    }
    return 1;
}
/**
* This is the implementation function for next().
* @internal
* @deprecated This API is ICU internal only.
*/
protected int handleNext() {
// Returns the next boundary: from the cache when one is active, otherwise from
// the inherited rule-based handleNext(), optionally refined by the dictionary.
CharacterIterator text = getText();
// if there are no cached break positions, or if we've just moved
// off the end of the range covered by the cache, we have to dump
// and possibly regenerate the cache
if (cachedBreakPositions == null || positionInCache == cachedBreakPositions.length - 1) {
// start by using the inherited handleNext() to find a tentative return
// value. dictionaryCharCount tells us how many dictionary characters
// we passed over on our way to the tentative return value
int startPos = text.getIndex();
fDictionaryCharCount = 0;
int result = super.handleNext();
// if we passed over more than one dictionary character, then we use
// divideUpDictionaryRange() to regenerate the cached break positions
// for the new range.
// (this step is skipped entirely when usingCTDictionary is set)
if (!usingCTDictionary && fDictionaryCharCount > 1 && result - startPos > 1) {
divideUpDictionaryRange(startPos, result);
}
// otherwise, the value we got back from the inherited function
// is our return value, and we can dump the cache
else {
cachedBreakPositions = null;
return result;
}
}
// if the cache of break positions has been regenerated (or existed all
// along), then just advance to the next break position in the cache
// and return it
if (cachedBreakPositions != null) {
++positionInCache;
text.setIndex(cachedBreakPositions[positionInCache]);
return cachedBreakPositions[positionInCache];
}
// Reached only if the cache is still null after the range was divided;
// the assertion flags that as an internal error.
///CLOVER:OFF
Assert.assrt(false);
return -9999; // SHOULD NEVER GET HERE!
///CLOVER:ON
}
/**
* This is the function that actually implements the dictionary-based
* algorithm. Given the endpoints of a range of text, it uses the
* dictionary to determine the positions of any boundaries in this
* range. It stores all the boundary positions it discovers in
* cachedBreakPositions so that we only have to do this work once
* for each time we enter the range.
*
* On return, cachedBreakPositions[0] is startPos, the last entry is
* endPos, and positionInCache has been reset to 0.
*
* @param startPos index of the start of the range to divide; it is stored
* as the first cached break position
* @param endPos index of the end of the range to divide; it always becomes
* the last cached break position
*/
// The unchecked warning being suppressed comes from the cast of
// Stack.clone() (which returns Object) back to Stack<Integer> below.
@SuppressWarnings("unchecked")
private void divideUpDictionaryRange(int startPos, int endPos) {
CharacterIterator text = getText();
// the range we're dividing may begin or end with non-dictionary characters
// (i.e., for line breaking, we may have leading or trailing punctuation
// that needs to be kept with the word). Seek from the beginning of the
// range to the first dictionary character
text.setIndex(startPos);
int c = CICurrent32(text);
// NOTE(review): this seek is not bounded by endPos; it presumably relies on
// the caller only passing ranges that contain at least one dictionary
// character -- confirm against the call site.
while (isDictionaryChar(c) == false) {
c = CINext32(text);
}
//System.out.println("\nDividing up range from " + (text.getIndex() + 1) + " to " + endPos);
// initialize. We maintain two stacks: currentBreakPositions contains
// the list of break positions that will be returned if we successfully
// finish traversing the whole range now. possibleBreakPositions lists
// all other possible word ends we've passed along the way. (Whenever
// we reach an error [a sequence of characters that can't begin any word
// in the dictionary], we back up, possibly delete some breaks from
// currentBreakPositions, move a break from possibleBreakPositions
// to currentBreakPositions, and start over from there. This process
// continues in this way until we either successfully make it all the way
// across the range, or exhaust all of our combinations of break
// positions.)
Stack<Integer> currentBreakPositions = new Stack<Integer>();
Stack<Integer> possibleBreakPositions = new Stack<Integer>();
List<Integer> wrongBreakPositions = new ArrayList<Integer>();
// the dictionary is implemented as a trie, which is treated as a state
// machine. -1 represents the end of a legal word. Every word in the
// dictionary is represented by a path from the root node to -1. A path
// that ends in state 0 is an illegal combination of characters.
int state = 0;
// these two variables are used for error handling. We keep track of the
// farthest we've gotten through the range being divided, and the combination
// of breaks that got us that far. If we use up all possible break
// combinations, the text contains an error or a word that's not in the
// dictionary. In this case, we "bless" the break positions that got us the
// farthest as real break positions, and then start over from scratch with
// the character where the error occurred.
int farthestEndPoint = text.getIndex();
Stack<Integer> bestBreakPositions = null;
// initialize (we always exit the loop with a break statement)
c = CICurrent32(text);
while (true) {
//System.out.print("c = " + Integer.toString(c, 16) + ", pos = " + text.getIndex());
// if we can transition to state "-1" from our current state, we're
// on the last character of a legal word. Push that position onto
// the possible-break-positions stack
if (dictionary.at(state, 0) == -1) {
possibleBreakPositions.push(Integer.valueOf(text.getIndex()));
}
// look up the new state to transition to in the dictionary
// There will be no supplementaries here because the Thai dictionary
// does not include any. This code is going away soon, not worth
// fixing.
// The "& 0xFFFF" mask maps the -1 "end of word" marker to 0xFFFF, which
// is why the test just below compares against 0xFFFF rather than -1.
state = (dictionary.at(state, (char)c)) & 0xFFFF; // TODO: fix supplementaries
//System.out.print(", state = " + state);
// if the character we're sitting on causes us to transition to
// the "end of word" state, then it was a non-dictionary character
// and we've successfully traversed the whole range. Drop out
// of the loop.
if (state == /*-1*/ 0xFFFF) {
currentBreakPositions.push(Integer.valueOf(text.getIndex()));
break;
}
// if the character we're sitting on causes us to transition to
// the error state, or if we've gone off the end of the range
// without transitioning to the "end of word" state, we've hit
// an error...
else if (state == 0 || text.getIndex() >= endPos) {
// if this is the farthest we've gotten, take note of it in
// case there's an error in the text
if (text.getIndex() > farthestEndPoint) {
farthestEndPoint = text.getIndex();
// snapshot the stack; later pops of currentBreakPositions must not affect it
bestBreakPositions = (Stack<Integer>)(currentBreakPositions.clone());
}
// wrongBreakPositions is a list of all break positions we've tried starting
// that didn't allow us to traverse all the way through the text. Every time
// we pop a break position off of currentBreakPositions, we put it into
// wrongBreakPositions to avoid trying it again later. If we make it to this
// spot, we're either going to back up to a break in possibleBreakPositions
// and try starting over from there, or we've exhausted all possible break
// positions and are going to do the fallback procedure. This loop prevents
// us from messing with anything in possibleBreakPositions that didn't work as
// a starting point the last time we tried it (this is to prevent a bunch of
// repetitive checks from slowing down some extreme cases)
// variable not used Integer newStartingSpot = null;
while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
possibleBreakPositions.peek())) {
possibleBreakPositions.pop();
}
// if we've used up all possible break-position combinations, there's
// an error or an unknown word in the text. In this case, we start
// over, treating the farthest character we've reached as the beginning
// of the range, and "blessing" the break positions that got us that
// far as real break positions
if (possibleBreakPositions.isEmpty()) {
if (bestBreakPositions != null) {
currentBreakPositions = bestBreakPositions;
if (farthestEndPoint < endPos) {
text.setIndex(farthestEndPoint + 1);
}
else {
break;
}
}
else {
// no progress was ever recorded: break at the current position (unless
// it is already the last recorded break or is startPos), step over one
// character, and break after it as well
if ((currentBreakPositions.size() == 0
|| currentBreakPositions.peek().intValue() != text.getIndex())
&& text.getIndex() != startPos) {
currentBreakPositions.push(Integer.valueOf(text.getIndex()));
}
CINext32(text);
currentBreakPositions.push(Integer.valueOf(text.getIndex()));
}
}
// if we still have more break positions we can try, then promote the
// last break in possibleBreakPositions into currentBreakPositions,
// and get rid of all entries in currentBreakPositions that come after
// it. Then back up to that position and start over from there (i.e.,
// treat that position as the beginning of a new word)
else {
Integer temp = possibleBreakPositions.pop();
Integer temp2 = null;
while (!currentBreakPositions.isEmpty() && temp.intValue() <
currentBreakPositions.peek().intValue()) {
temp2 = currentBreakPositions.pop();
wrongBreakPositions.add(temp2);
}
currentBreakPositions.push(temp);
text.setIndex(currentBreakPositions.peek().intValue());
}
// re-sync "c" for the next go-round, and drop out of the loop if
// we've made it off the end of the range
c = CICurrent32(text);
state = 0;
if (text.getIndex() >= endPos) {
break;
}
}
// if we didn't hit any exceptional conditions on this last iteration,
// just advance to the next character and loop
else {
c = CINext32(text);
}
//System.out.print(", possibleBreakPositions = { "); for (int i = 0; i < possibleBreakPositions.size(); i++) System.out.print(possibleBreakPositions.elementAt(i) + " "); System.out.print("}");
//System.out.print(", currentBreakPositions = { "); for (int i = 0; i < currentBreakPositions.size(); i++) System.out.print(currentBreakPositions.elementAt(i) + " "); System.out.println("}");
}
// dump the last break position in the list, and replace it with the actual
// end of the range (which may be the same character, or may be further on
// because the range actually ended with non-dictionary characters we want to
// keep with the word)
if (!currentBreakPositions.isEmpty()) {
currentBreakPositions.pop();
}
currentBreakPositions.push(Integer.valueOf(endPos));
// create a regular array to hold the break positions and copy
// the break positions from the stack to the array (in addition,
// our starting position goes into this array as a break position).
// This array becomes the cache of break positions used by next()
// and previous(), so this is where we actually refresh the cache.
cachedBreakPositions = new int[currentBreakPositions.size() + 1];
cachedBreakPositions[0] = startPos;
for (int i = 0; i < currentBreakPositions.size(); i++) {
cachedBreakPositions[i + 1] = currentBreakPositions.elementAt(i).intValue();
}
positionInCache = 0;
}
}

View File

@ -0,0 +1,69 @@
/*
*******************************************************************************
* Copyright (C) 2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.text.CharacterIterator;
import java.util.Stack;
/**
 * Common scaffolding for break engines that segment a run of recognized
 * characters with a dictionary. This class locates the run of characters
 * contained in {@link #fSet} and delegates the actual segmentation to
 * {@link #divideUpDictionaryRange}.
 */
abstract class DictionaryBreakEngine implements LanguageBreakEngine {
    /**
     * The characters this engine recognizes. It starts out empty here;
     * presumably subclasses populate it (not visible in this class).
     */
    protected UnicodeSet fSet = new UnicodeSet();

    /** Bit mask of supported break kinds: bit n set means break type n may use this engine. */
    private final int fTypes;

    /**
     * @param breakTypes A mask of the break iterators that can use this engine.
     *  For example, (1 << KIND_WORD) | (1 << KIND_LINE) could be used by
     *  word iterators and line iterators, but not any other kind.
     */
    public DictionaryBreakEngine(int breakTypes) {
        // TODO: consider using a java.util.BitSet with nbits <= 32
        fTypes = breakTypes;
    }

    /**
     * @param c         a code point
     * @param breakType the kind of break iterator asking
     * @return true if breakType is in [0, 32), its bit is set in the mask given
     *         to the constructor, and c is contained in fSet
     */
    public boolean handles(int c, int breakType) {
        return (breakType >= 0 && breakType < 32) &&  // breakType is in range
                ((1 << breakType) & fTypes) != 0 &&   // this type can use us
                fSet.contains(c);                     // we recognize the character
    }

    /**
     * Finds the run of characters from fSet that begins (or, when reverse is
     * true, ends) at the iterator's current position, hands that run to
     * divideUpDictionaryRange(), and leaves the iterator at the index where
     * the scan stopped.
     *
     * @param text_       the text; scanning starts at its current index
     * @param startPos    lower bound for the backward scan
     * @param endPos      upper bound for the forward scan
     * @param reverse     true to scan backward from the current index
     * @param breakType   the kind of break iterator asking
     * @param foundBreaks stack passed through to divideUpDictionaryRange()
     * @return 0 if breakType is not supported by this engine, otherwise the
     *         value returned by divideUpDictionaryRange()
     */
    public int findBreaks(CharacterIterator text_, int startPos, int endPos,
            boolean reverse, int breakType, Stack<Integer> foundBreaks) {
        if (breakType < 0 || breakType >= 32 ||
                ((1 << breakType) & fTypes) == 0) {
            return 0;
        }

        int result = 0;
        UCharacterIterator text = UCharacterIterator.getInstance(text_);
        int start = text.getIndex();
        int current, rangeStart, rangeEnd;
        int c = text.current();
        if (reverse) {
            boolean isDict = fSet.contains(c);
            // previous() is pre-decrement: it moves back and returns the unit
            // at the new index, so c always matches the iterator position.
            while ((current = text.getIndex()) > startPos && isDict) {
                c = text.previous();
                isDict = fSet.contains(c);
            }
            rangeStart = (current < startPos) ? startPos :
                    current + (isDict ? 0 : 1);
            rangeEnd = start + 1;
        } else {
            while ((current = text.getIndex()) < endPos && fSet.contains(c)) {
                // next() is post-increment: it returns the unit at the OLD index.
                // Using its return value would make c lag one position behind the
                // iterator and let the range run one past the last dictionary
                // character, so advance first and then re-read the current unit.
                text.next();
                c = text.current();
            }
            rangeStart = start;
            rangeEnd = current;
        }

        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
        text.setIndex(current);

        return result;
    }

    /**
     * Divides the range [rangeStart, rangeEnd) of text into words.
     *
     * @param text        the text being broken
     * @param rangeStart  start index of the run found by findBreaks()
     * @param rangeEnd    end index of the run found by findBreaks()
     * @param foundBreaks stack supplied by the caller of findBreaks(); presumably
     *                    receives the break positions -- see implementations
     * @return value propagated unchanged as the result of findBreaks()
     */
    protected abstract int divideUpDictionaryRange(UCharacterIterator text,
            int rangeStart, int rangeEnd, Stack<Integer> foundBreaks);
}

View File

@ -0,0 +1,90 @@
/*
*******************************************************************************
* Copyright (C) 2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.ICUData;
import com.ibm.icu.impl.ICUResourceBundle;
import com.ibm.icu.util.UResourceBundle;
/**
 * Static loader for break-iterator dictionary data. The data consists of an
 * ICU binary header, an array of int indexes, and then the serialized trie.
 */
final class DictionaryData {
    // disallow instantiation
    private DictionaryData() { }

    // Values stored in indexes[IX_TRIE_TYPE] (after masking with TRIE_TYPE_MASK)
    public static final int TRIE_TYPE_BYTES = 0;
    public static final int TRIE_TYPE_UCHARS = 1;
    public static final int TRIE_TYPE_MASK = 7;
    public static final int TRIE_HAS_VALUES = 8;

    // Values related to indexes[IX_TRANSFORM]
    public static final int TRANSFORM_NONE = 0;
    public static final int TRANSFORM_TYPE_OFFSET = 0x1000000;
    public static final int TRANSFORM_TYPE_MASK = 0x7f000000;
    public static final int TRANSFORM_OFFSET_MASK = 0x1fffff;

    // Positions within the int indexes[] array that follows the binary header
    public static final int IX_STRING_TRIE_OFFSET = 0;
    public static final int IX_RESERVED1_OFFSET = 1;
    public static final int IX_RESERVED2_OFFSET = 2;
    public static final int IX_TOTAL_SIZE = 3;
    public static final int IX_TRIE_TYPE = 4;
    public static final int IX_TRANSFORM = 5;
    public static final int IX_RESERVED6 = 6;
    public static final int IX_RESERVED7 = 7;
    public static final int IX_COUNT = 8;

    // "Dict" in ASCII
    private static final byte[] DATA_FORMAT_ID = { (byte) 0x44, (byte) 0x69,
            (byte) 0x63, (byte) 0x74 };

    /**
     * Loads the dictionary registered under "dictionaries/&lt;dictType&gt;" in the
     * break-iterator resource bundle and wraps it in a matcher.
     *
     * @param dictType key of the dictionary within the "dictionaries" resource table
     * @return a BytesDictionaryMatcher or CharsDictionaryMatcher depending on the
     *         trie type recorded in the data, or null if the trie type is
     *         neither TRIE_TYPE_BYTES nor TRIE_TYPE_UCHARS
     * @throws IOException if the data cannot be read completely
     */
    public static DictionaryMatcher loadDictionaryFor(String dictType) throws IOException {
        ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BRKITR_BASE_NAME);
        String dictFileName = rb.getStringWithFallback("dictionaries/" + dictType);
        dictFileName = ICUResourceBundle.ICU_BUNDLE + ICUResourceBundle.ICU_BRKITR_NAME + "/" + dictFileName;
        InputStream is = ICUData.getStream(dictFileName);
        try {
            ICUBinary.readHeader(is, DATA_FORMAT_ID, null);
            DataInputStream s = new DataInputStream(is);

            int[] indexes = new int[IX_COUNT];
            // TODO: read indexes[IX_STRING_TRIE_OFFSET] first, then read a variable-length indexes[]
            for (int i = 0; i < IX_COUNT; i++) {
                indexes[i] = s.readInt();
            }

            // The trie starts 'offset' bytes after the beginning of indexes[];
            // anything between the fixed indexes and the trie is skipped.
            int offset = indexes[IX_STRING_TRIE_OFFSET];
            Assert.assrt(offset >= (4 * IX_COUNT));
            if (offset > (4 * IX_COUNT)) {
                int diff = offset - (4 * IX_COUNT);
                // skipBytes() may skip fewer bytes than requested without
                // reporting an error; readFully() either consumes exactly
                // 'diff' bytes or throws.
                s.readFully(new byte[diff]);
            }

            int trieType = indexes[IX_TRIE_TYPE] & TRIE_TYPE_MASK;
            int totalSize = indexes[IX_TOTAL_SIZE] - offset;
            DictionaryMatcher m = null;
            if (trieType == TRIE_TYPE_BYTES) {
                int transform = indexes[IX_TRANSFORM];
                byte[] data = new byte[totalSize];
                s.readFully(data);
                m = new BytesDictionaryMatcher(data, transform);
            } else if (trieType == TRIE_TYPE_UCHARS) {
                Assert.assrt(totalSize % 2 == 0);
                int num = totalSize / 2;
                char[] data = new char[num];
                for (int i = 0; i < num; i++) {
                    data[i] = s.readChar();
                }
                m = new CharsDictionaryMatcher(new String(data));
            }
            return m;
        } finally {
            // Close on every path, including when a read above throws.
            // Closing the underlying stream is sufficient: DataInputStream
            // holds no resources of its own.
            if (is != null) {
                is.close();
            }
        }
    }
}

View File

@ -0,0 +1,40 @@
/*
*******************************************************************************
* Copyright (C) 2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.text.CharacterIterator;
/**
 * Abstraction over the back-end data structure that holds a break-iteration
 * dictionary, so that the break iteration code can work with arbitrary
 * "types" of dictionary storage.
 */
abstract class DictionaryMatcher {
    /**
     * Convenience form of
     * {@link #matches(CharacterIterator, int, int[], int[], int, int[])}
     * for callers that do not need word values: it forwards all arguments
     * and passes null for the values array.
     *
     * @param text      A CharacterIterator representing the text.
     * @param maxLength The maximum number of code units to match.
     * @param lengths   An array that is filled with the lengths of words that matched.
     * @param count     Filled with the number of elements output in lengths.
     * @param limit     The maximum amount of words to output.
     * @return The number of characters in text that were matched.
     */
    public int matches(CharacterIterator text, int maxLength, int[] lengths,
            int[] count, int limit) {
        final int[] noValues = null;
        return matches(text, maxLength, lengths, count, limit, noValues);
    }

    /**
     * Find dictionary words that match the text.
     *
     * @param text      A CharacterIterator representing the text. The iterator is
     *                  left after the longest prefix match in the dictionary.
     * @param maxLength The maximum number of code units to match.
     * @param lengths   An array that is filled with the lengths of words that matched.
     * @param count     Filled with the number of elements output in lengths.
     * @param limit     The maximum amount of words to output. Must be less than or
     *                  equal to lengths.length.
     * @param values    Filled with the weight values associated with the various
     *                  words; the five-argument overload passes null here.
     * @return The number of characters in text that were matched.
     */
    public abstract int matches(CharacterIterator text, int maxLength, int[] lengths,
            int[] count, int limit, int[] values);

    /**
     * @return the kind of dictionary that this matcher is using
     */
    public abstract int getType();
}

View File

@ -0,0 +1,40 @@
/*
*******************************************************************************
* Copyright (C) 2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.text.CharacterIterator;
import java.util.Stack;
/**
 * Contract for pluggable, language-specific break iteration logic.
 * A break iterator first asks an engine whether it {@link #handles} a
 * character, and if so calls {@link #findBreaks} to locate the boundaries.
 */
interface LanguageBreakEngine {
    /**
     * Reports whether this engine is applicable to a character.
     *
     * @param c         A Unicode codepoint value
     * @param breakType The kind of break iterator that is wanting to make use
     *                  of this engine - character, word, line, sentence
     * @return true if the engine can handle this character, false otherwise
     */
    boolean handles(int c, int breakType);

    /**
     * Implements the actual breaking logic.
     *
     * @param text        The text to break over
     * @param startPos    The index of the beginning of our range
     * @param endPos      The index of the possible end of our range. It is
     *                    possible, however, that our range ends earlier
     * @param reverse     true iff we are iterating backwards (in a call to
     *                    previous(), for example)
     * @param breakType   The kind of break iterator that is wanting to make use
     *                    of this engine - character, word, line, sentence
     * @param foundBreaks A Stack that the breaks found will be added to
     * @return the number of words found
     */
    int findBreaks(CharacterIterator text, int startPos, int endPos,
            boolean reverse, int breakType, Stack<Integer> foundBreaks);
}

View File

@ -1,20 +1,20 @@
/*
*******************************************************************************
* Copyright (C) 1996-2011, International Business Machines Corporation and *
* Copyright (C) 2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.io.IOException;
import java.io.InputStream;
import java.text.CharacterIterator;
import java.util.Stack;
import com.ibm.icu.impl.Assert;
class ThaiBreakIterator extends DictionaryBasedBreakIterator {
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
public class ThaiBreakEngine implements LanguageBreakEngine {
/* Helper class for improving readability of the Thai word break
* algorithm.
*/
@ -25,7 +25,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
//list of word candidate lengths, in increasing length order
private int lengths[];
private int count[]; // Count of candidates
private int prefix; // The longeset match with a dictionary word
private int prefix; // The longest match with a dictionary word
private int offset; // Offset in the text of these candidates
private int mark; // The preferred candidate's offset
private int current; // The candidate we're currently looking at
@ -38,7 +38,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
}
// Fill the list of candidates if needed, select the longest, and return the number found
public int candidates(CharacterIterator fIter, BreakCTDictionary dict, int rangeEnd) {
public int candidates(CharacterIterator fIter, DictionaryMatcher dict, int rangeEnd) {
int start = fIter.getIndex();
if (start != offset) {
offset = start;
@ -62,7 +62,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
return lengths[mark];
}
// Backup from the current candidate to the next shorter one; rreturn true if that exists
// Backup from the current candidate to the next shorter one; return true if that exists
// and point the text after it
public boolean backUp(CharacterIterator fIter) {
if (current > 0) {
@ -82,14 +82,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
mark = current;
}
}
private static UnicodeSet fThaiWordSet;
private static UnicodeSet fEndWordSet;
private static UnicodeSet fBeginWordSet;
private static UnicodeSet fSuffixSet;
private static UnicodeSet fMarkSet;
private BreakCTDictionary fDictionary;
// Constants for ThaiBreakIterator
// How many words in a row are "good enough"?
private static final byte THAI_LOOKAHEAD = 3;
@ -104,9 +97,14 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
private static final char THAI_MAIYAMOK = 0x0E46;
// Minimum word size
private static final byte THAI_MIN_WORD = 2;
// Minimum number of characters for two words
//private final int THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
private DictionaryMatcher fDictionary;
private static UnicodeSet fThaiWordSet;
private static UnicodeSet fEndWordSet;
private static UnicodeSet fBeginWordSet;
private static UnicodeSet fSuffixSet;
private static UnicodeSet fMarkSet;
static {
// Initialize UnicodeSets
fThaiWordSet = new UnicodeSet();
@ -141,73 +139,28 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
fBeginWordSet.freeze();
fSuffixSet.freeze();
}
public ThaiBreakIterator(InputStream ruleStream, InputStream dictionaryStream) throws IOException {
super(ruleStream);
// Initialize diciontary
fDictionary = new BreakCTDictionary(dictionaryStream);
public ThaiBreakEngine() throws IOException {
// Initialize dictionary
fDictionary = DictionaryData.loadDictionaryFor("Thai");
}
/**
* This is the implementation function for next().
*/
protected int handleNext() {
CharacterIterator text = getText();
// if there are no cached break positions, or if we've just moved
// off the end of the range covered by the cache, we have to dump
// and possibly regenerate the cache
if (cachedBreakPositions == null || positionInCache == cachedBreakPositions.length - 1) {
// start by using the inherited handleNext() to find a tentative return
// value. dictionaryCharCount tells us how many dictionary characters
// we passed over on our way to the tentative return value
int startPos = text.getIndex();
fDictionaryCharCount = 0;
int result = super.handleNext();
// if we passed over more than one dictionary character, then we use
// divideUpDictionaryRange() to regenerate the cached break positions
// for the new range
if (fDictionaryCharCount > 1 && result - startPos > 1) {
divideUpDictionaryRange(startPos, result);
}
// otherwise, the value we got back from the inherited fuction
// is our return value, and we can dump the cache
else {
cachedBreakPositions = null;
return result;
}
public boolean handles(int c, int breakType) {
if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
return (script == UScript.THAI);
}
// if the cache of break positions has been regenerated (or existed all
// along), then just advance to the next break position in the cache
// and return it
if (cachedBreakPositions != null) {
++positionInCache;
text.setIndex(cachedBreakPositions[positionInCache]);
return cachedBreakPositions[positionInCache];
}
Assert.assrt(false);
return -9999; // SHOULD NEVER GET HERE!
return false;
}
/**
* Divide up a range of known dictionary characters.
*
* @param rangeStart The start of the range of dictionary characters
* @param rangeEnd The end of the range of dictionary characters
* @return The number of breaks found
*/
private int divideUpDictionaryRange(int rangeStart, int rangeEnd) {
public int findBreaks(CharacterIterator fIter, int rangeStart, int rangeEnd, boolean reverse, int breakType,
Stack<Integer> foundBreaks) {
if ((rangeEnd - rangeStart) < THAI_MIN_WORD) {
return 0; // Not enough chacters for word
return 0; // Not enough characters for word
}
CharacterIterator fIter = getText();
int wordsFound = 0;
int wordLength;
int current;
Stack<Integer> foundBreaks = new Stack<Integer>();
PossibleWord words[] = new PossibleWord[THAI_LOOKAHEAD];
for (int i = 0; i < THAI_LOOKAHEAD; i++) {
words[i] = new PossibleWord();
@ -228,7 +181,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
wordsFound += 1;
}
// If there was more than one, see which one can take use forward the most words
// If there was more than one, see which one can take us forward the most words
else if (candidates > 1) {
boolean foundBest = false;
// If we're already at the end of the range, we're done
@ -259,9 +212,10 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
}
} while (words[wordsFound%THAI_LOOKAHEAD].backUp(fIter) && !foundBest);
}
/* foundBest: */wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter);
wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter);
wordsFound += 1;
}
// We come here after having either found a word or not. We look ahead to the
// next word. If it's not a dictionary word, we will combine it with the word we
// just found (if there is one), but only if the preceding word does not exceed
@ -291,8 +245,8 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
// two characters after uc were not 0x0E4C THANTHAKHAT before
// checking the dictionary. That is just a performance filter,
// but it's not clear it's faster than checking the trie
int candidate = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
fIter.setIndex(current+wordLength+chars);
int candidate = words[(wordsFound + 1) %THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
fIter.setIndex(current + wordLength + chars);
if (candidate > 0) {
break;
}
@ -300,7 +254,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
pc = uc;
}
// Bump the word cound if there wasn't already one
// Bump the word count if there wasn't already one
if (wordLength <= 0) {
wordsFound += 1;
}
@ -351,13 +305,13 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
}
}
} else {
fIter.setIndex(current+wordLength);
fIter.setIndex(current + wordLength);
}
}
// Did we find a word on this iteration? If so, push it on the break stack
if (wordLength > 0) {
foundBreaks.push(Integer.valueOf(current+wordLength));
foundBreaks.push(Integer.valueOf(current + wordLength));
}
}
@ -367,16 +321,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
wordsFound -= 1;
}
// Store the break points in cachedBreakPositions.
cachedBreakPositions = new int[foundBreaks.size() + 2];
cachedBreakPositions[0] = rangeStart;
int i;
for (i = 0; i < foundBreaks.size(); i++) {
cachedBreakPositions[i + 1] = foundBreaks.elementAt(i).intValue();
}
cachedBreakPositions[i + 1] = rangeEnd;
positionInCache = 0;
return wordsFound;
}
}

View File

@ -0,0 +1,46 @@
/*
*******************************************************************************
* Copyright (C) 2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.text.CharacterIterator;
import java.util.Stack;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import static com.ibm.icu.impl.CharacterIteration.*;
/**
 * Break engine that claims characters no other engine handles. It keeps, per
 * break type, the set of characters it has been told about via
 * {@link #handleChar}, and its findBreaks() finds no breaks: it simply moves
 * the iterator to the end of the range.
 */
public final class UnhandledBreakEngine implements LanguageBreakEngine {
    // TODO: Use two arrays of UnicodeSet, one with all frozen sets, one with unfrozen.
    // in handleChar(), update the unfrozen version, clone, freeze, replace the frozen one.
    // NOTE(review): handles() reads these sets without holding the lock that
    // handleChar() takes; the TODO above describes the intended remedy.
    private final UnicodeSet[] fHandled = new UnicodeSet[BreakIterator.KIND_TITLE + 1];

    /** Creates an engine with an empty handled-set for every break type. */
    public UnhandledBreakEngine() {
        for (int i = 0; i < fHandled.length; i++) {
            fHandled[i] = new UnicodeSet();
        }
    }

    /**
     * @param c         a code point
     * @param breakType the kind of break iterator asking
     * @return true if breakType is a valid index and c has been recorded for
     *         that break type by handleChar()
     */
    public boolean handles(int c, int breakType) {
        return (breakType >= 0 && breakType < fHandled.length) &&
                (fHandled[breakType].contains(c));
    }

    /**
     * Moves the iterator to endPos without recording any breaks.
     *
     * @return always 0
     */
    public int findBreaks(CharacterIterator text, int startPos, int endPos,
            boolean reverse, int breakType, Stack<Integer> foundBreaks) {
        text.setIndex(endPos);
        return 0;
    }

    /**
     * Records that this engine handles c for the given break type by adding
     * every character that shares c's script to that break type's set.
     * Out-of-range break types and DONE32 are ignored.
     *
     * @param c         the code point to start handling
     * @param breakType the kind of break iterator the character was seen by
     */
    public synchronized void handleChar(int c, int breakType) {
        if (breakType >= 0 && breakType < fHandled.length && c != DONE32) {
            UnicodeSet handled = fHandled[breakType];
            if (!handled.contains(c)) {
                int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
                // applyIntPropertyValue() REPLACES the contents of the set it is
                // called on. Applying it directly to the handled set would discard
                // every script recorded earlier, so build the script's set
                // separately and merge it in.
                UnicodeSet scriptSet = new UnicodeSet();
                scriptSet.applyIntPropertyValue(UProperty.SCRIPT, script);
                handled.addAll(scriptSet);
            }
        }
    }
}

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a243a8584459d751b33c922f2fbfaea27200721a1a27661b5fa2ec96bb5fc6e2
size 7929565
oid sha256:23641fd85dfa40f916a7a5b47a6dc8ebd591862a9fe2d62ddcd46b7f1a862d36
size 9286396

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fc6ebf5e136b448a03a7e74463c67d96217cc9f9d3feed4d2aa7f74dc5e25e63
oid sha256:e951e7a3cc20e7126326db97e92ce533db611fde39c201795680246fde86c8e0
size 97666

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2029b2752b52d544749fffea9b2574ddfd19ea278cf5f26243efd98bd3f15313
size 719725
oid sha256:54eeee6d7834231edb7d2d9bd3174d3c4347c737f556bc6b25915bb6860b6fe2
size 719912

View File

@ -1,16 +1,11 @@
/*
*******************************************************************************
* Copyright (C) 1996-2010, International Business Machines Corporation and *
* Copyright (C) 1996-2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.dev.test.rbbi;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.StringCharacterIterator;
import java.util.ArrayList;
import java.util.List;
@ -18,7 +13,6 @@ import java.util.Locale;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.DictionaryBasedBreakIterator;
public class BreakIteratorTest extends TestFmwk
{
@ -849,52 +843,4 @@ public class BreakIteratorTest extends TestFmwk
errln("ERR: Failed to create an instance type: " + type + " / locale: " + loc + " / exception: " + e.getMessage());
}
}
/*
 * Tests the constructors public DictionaryBasedBreakIterator(String rules, ... public
 * DictionaryBasedBreakIterator(InputStream compiledRules, ...
 *
 * Each case passes invalid input and expects the constructor to throw; reaching
 * the errln() call after construction means no exception was raised.
 */
public void TestDictionaryBasedBreakIterator() throws IOException {
    // The following class allows the testing of the constructor
    // public DictionaryBasedBreakIterator(String rules, ...
    class TestDictionaryBasedBreakIterator extends DictionaryBasedBreakIterator {
        public TestDictionaryBasedBreakIterator(InputStream is) throws IOException {
            super("", is);
        }
    }
    try {
        @SuppressWarnings("unused")
        TestDictionaryBasedBreakIterator td = new TestDictionaryBasedBreakIterator(null);
        errln("DictionaryBasedBreakIterator constructor is suppose to return an "
                + "exception for an empty string.");
    } catch (Exception e) {
        // expected: construction must fail
    }

    // Declared outside the try so the temp file and stream can be released
    // no matter how the attempt ends.
    File file = null;
    DataInputStream dis = null;
    try {
        file = File.createTempFile("dummy", "");
        dis = new DataInputStream(new FileInputStream(file));
        @SuppressWarnings("unused")
        TestDictionaryBasedBreakIterator td = new TestDictionaryBasedBreakIterator(dis);
        errln("DictionaryBasedBreakIterator constructor is suppose to return an "
                + "exception for a temporary file with EOF.");
    } catch (Exception e) {
        // expected: construction must fail
    } finally {
        try {
            if (dis != null) {
                dis.close();
            }
        } finally {
            if (file != null) {
                file.delete();
            }
        }
    }

    // The following class allows the testing of the constructor
    // public DictionaryBasedBreakIterator(InputStream compiledRules, ...
    class TestDictionaryBasedBreakIterator1 extends DictionaryBasedBreakIterator {
        public TestDictionaryBasedBreakIterator1() throws IOException {
            super((InputStream) null, (InputStream) null);
        }
    }
    try {
        @SuppressWarnings("unused")
        TestDictionaryBasedBreakIterator1 td1 = new TestDictionaryBasedBreakIterator1();
        errln("DictionaryBasedBreakIterator constructor is suppose to return an "
                + "exception for an null input stream.");
    } catch (Exception e) {
        // expected: construction must fail
    }
}
}
}

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 1996-2011, International Business Machines Corporation and
* Copyright (C) 1996-2012, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -20,7 +20,6 @@ import java.util.List;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.DictionaryBasedBreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.util.ULocale;
@ -584,7 +583,7 @@ public class RBBITest extends TestFmwk {
errln("Incorrect following position.");
}
int []fillInArray = new int[2];
if (((DictionaryBasedBreakIterator)brk).getRuleStatusVec(fillInArray) != 1 || fillInArray[0] != 0) {
if (((RuleBasedBreakIterator)brk).getRuleStatusVec(fillInArray) != 1 || fillInArray[0] != 0) {
errln("Error: Since getRuleStatusVec is not supported in DictionaryBasedBreakIterator, it should return 1 and fillInArray[0] == 0.");
}
}
@ -663,11 +662,6 @@ public class RBBITest extends TestFmwk {
final String posxWordText = "Can't have breaks in xx:yy or struct.field for CS-types.";
final int[] posxWordTOffsets = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 };
final int[] posxWordROffsets = { 5, 6, 10, 11, 17, 18, 20, 21, 26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 };
// KIND_WORD "ja"
final String jaWordText = "\u79C1\u9054\u306B\u4E00\u3007\u3007\u3007\u306E\u30B3\u30F3\u30D4\u30E5\u30FC\u30BF" +
"\u304C\u3042\u308B\u3002\u5948\u3005\u306F\u30EF\u30FC\u30C9\u3067\u3042\u308B\u3002";
final int[] jaWordTOffsets = { 2, 3, 7, 8, 14, 17, 18, 20, 21, 24, 27, 28 };
final int[] jaWordROffsets = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
// KIND_SENTENCE "el"
final String elSentText = "\u0391\u03B2, \u03B3\u03B4; \u0395 \u03B6\u03B7\u037E \u0398 \u03B9\u03BA. " +
"\u039B\u03BC \u03BD\u03BE! \u039F\u03C0, \u03A1\u03C2? \u03A3";
@ -688,8 +682,6 @@ public class RBBITest extends TestFmwk {
final TBItem[] tests = {
new TBItem( BreakIterator.KIND_WORD, new ULocale("en_US_POSIX"), posxWordText, posxWordTOffsets ),
new TBItem( BreakIterator.KIND_WORD, ULocale.ROOT, posxWordText, posxWordROffsets ),
new TBItem( BreakIterator.KIND_WORD, new ULocale("ja"), jaWordText, jaWordTOffsets ),
new TBItem( BreakIterator.KIND_WORD, ULocale.ROOT, jaWordText, jaWordROffsets ),
new TBItem( BreakIterator.KIND_SENTENCE, new ULocale("el"), elSentText, elSentTOffsets ),
new TBItem( BreakIterator.KIND_SENTENCE, ULocale.ROOT, elSentText, elSentROffsets ),
new TBItem( BreakIterator.KIND_CHARACTER, new ULocale("th"), thCharText, thCharTOffsets ),

View File

@ -51,7 +51,6 @@ static class TestParams {
public void TestExtended() {
TestParams tp = new TestParams();
@ -434,6 +433,7 @@ void executeTest(TestParams t) {
}
}
//
// Run the iterator backwards, verify that the same breaks are found.
//

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2003-2011 International Business Machines Corporation and
* Copyright (C) 2003-2012 International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -264,15 +264,19 @@ public class RBBITestMonkey extends TestFmwk {
UnicodeSet fExtendSet;
UnicodeSet fExtendNumLetSet;
UnicodeSet fOtherSet;
UnicodeSet fDictionaryCjkSet;
RBBIWordMonkey() {
fCharProperty = UProperty.WORD_BREAK;
fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]");
fCRSet = new UnicodeSet("[\\p{Word_Break = CR}]");
fLFSet = new UnicodeSet("[\\p{Word_Break = LF}]");
fNewlineSet = new UnicodeSet("[\\p{Word_Break = Newline}]");
fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]");
fALetterSet.removeAll(fDictionaryCjkSet);
fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]");
fMidNumLetSet = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
@ -297,13 +301,14 @@ public class RBBITestMonkey extends TestFmwk {
fOtherSet.removeAll(fExtendNumLetSet);
// Inhibit dictionary characters from being tested at all.
fOtherSet.removeAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));
fOtherSet.removeAll(fDictionaryCjkSet);
fSets = new ArrayList();
fSets.add(fCRSet);
fSets.add(fLFSet);
fSets.add(fNewlineSet);
fSets.add(fALetterSet);
fSets.add(fKatakanaSet);
//fSets.add(fKatakanaSet); // TODO: work out how to test katakana
fSets.add(fMidLetterSet);
fSets.add(fMidNumLetSet);
fSets.add(fMidNumSet);
@ -1484,7 +1489,6 @@ public class RBBITestMonkey extends TestFmwk {
/**
* return the index of the next code point in the input text.
* @param i the preceding index
* @return
*/
static int nextCP(StringBuffer s, int i) {
if (i == -1) {

View File

@ -1,19 +1,15 @@
/*
*******************************************************************************
* Copyright (C) 1996-2006, International Business Machines Corporation and *
* Copyright (C) 1996-2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.dev.test.rbbi;
import java.io.IOException;
import java.io.InputStream;
import java.util.ListResourceBundle;
import java.util.MissingResourceException;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.DictionaryBasedBreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
// TODO: {dlf} this test currently doesn't test anything!
@ -160,30 +156,12 @@ public class SimpleBITest extends TestFmwk{
"Character", "Word", "Line", "Sentence"
};
String rulesName = kindNames[kind] + "BreakRules";
String dictionaryName = kindNames[kind] + "BreakDictionary";
String[] classNames = bundle.getStringArray("BreakIteratorClasses");
String rules = bundle.getString(rulesName);
if (classNames[kind].equals("RuleBasedBreakIterator")) {
iter = new RuleBasedBreakIterator(rules);
}
else if (classNames[kind].equals("DictionaryBasedBreakIterator")) {
try {
String dictionaryPath = bundle.getString(dictionaryName);
InputStream dictionary = bundle.getClass().getResourceAsStream(dictionaryPath);
System.out.println("looking for " + dictionaryPath + " from " + bundle.getClass() + " returned " + dictionary);
iter = new DictionaryBasedBreakIterator(rules, dictionary);
}
catch(IOException e) {
e.printStackTrace();
errln(e.getMessage());
System.out.println(e); // debug
}
catch(MissingResourceException e) {
errln(e.getMessage());
System.out.println(e); // debug
}
}
if (iter == null) {
errln("could not create iterator");
}

View File

@ -33,9 +33,8 @@
# Temp debugging tests
<locale en>
<line>
<data>•Hello, •World.•</data>
<data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udcfb•</data>
########################################################################################
#
@ -171,7 +170,14 @@
<data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>
# Hiragana & Katakana stay together, but separate from each other and from Latin.
<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>
# *** what to do about theoretical combos of chars? i.e. hiragana + accent
#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<400>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<400>\N{HIRAGANA ITERATION MARK}<400>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<400>def<200>#•</data>
# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth
<data>•芽キャベツ<400>芽キャベツ<400></data>
# Testing of word boundary for dictionary word containing both kanji and kana
<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data>
# Words with interior formatting characters
<data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data>
@ -179,7 +185,6 @@
# to test for bug #4097779
<data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>
# to test for bug #4098467
# What follows is a string of Korean characters (I found it in the Yellow Pages
# ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
@ -188,9 +193,14 @@
# precomposed syllables...
<data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data>
<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data>
# more Korean tests (Jamo not tested here, not counted as dictionary characters)
# Disable them now because we don't include a Korean dictionary.
#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<200>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data>
#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2dd<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200> •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data>
#<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</data>
<data>•\u06c9<200>\uc799<200>\ufffa•</data>
<data>•\u06c9\uc799\ufffa<200></data>
#
# Try some words from other scripts.
@ -507,8 +517,7 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
<data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c•</data>
# conjoining jamo...
# TODO: rules update needed
#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>
<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>
# to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
<data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data>
@ -572,17 +581,17 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
# Test data originally from the test code source file
# // @suwit -- Thai sample data from GVT Guideline
#
#<data>•\u0E2B\u0E19\u0E36\u0E48\u0E07<200>\u0E04\u0E33<200>\u0E44\u0E17\u0E22<200>\
#\u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16<200>\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A<200>\
#\u0E14\u0E49\u0E27\u0E22<200>\u0e2b\u0e25\u0e32\u0e22<200>\
#\u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<200></data>
#
## Test data originally from http://bugs.icu-project.org/trac/search?q=r30327
#<data>•กู<200> •กิน<200>กุ้ง<200> •ปิ้่<200>งอ<200>ยู่<200>ใน<200>ถ้ำ<200></data>
#
#<data>•\u0E01\u0E39<200>\u0020•\u0E01\u0E34\u0E19<200>\u0E01\u0E38\u0E49\u0E07<200>\
#\u0020•\u0E1B\u0E34\u0E49\u0E48<200>\u0E07\u0E2D<200>\u0E22\u0E39\u0E48<200>\
#\u0E43\u0E19<200>\u0E16\u0E49\u0E33<200></data>
<data>•\u0E2B\u0E19\u0E36\u0E48\u0E07<200>\u0E04\u0E33<200>\u0E44\u0E17\u0E22<200>\
\u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16<200>\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A<200>\
\u0E14\u0E49\u0E27\u0E22<200>\u0e2b\u0e25\u0e32\u0e22<200>\
\u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<200></data>
# Test data originally from http://bugs.icu-project.org/trac/search?q=r30327
<data>•กู<200> •กิน<200>กุ้ง<200> •ปิ้่<200>งอ<200>ยู่<200>ใน<200>ถ้ำ<200></data>
<data>•\u0E01\u0E39<200>\u0020•\u0E01\u0E34\u0E19<200>\u0E01\u0E38\u0E49\u0E07<200>\
\u0020•\u0E1B\u0E34\u0E49\u0E48<200>\u0E07\u0E2D<200>\u0E22\u0E39\u0E48<200>\
\u0E43\u0E19<200>\u0E16\u0E49\u0E33<200></data>
<line>
<data>•0E01\u0E39\u0020•\u0E01\u0E34\u0E19•\u0E01\u0E38\u0E49\u0E07\
@ -619,22 +628,22 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
# @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters start
#
<line>
#<data>•\u0E1B\u0E35•\
#\u0E1E\u0E38\u0E17\u0E18\u0E28\u0E31\u0E01\u0E23\u0E32\u0E0A •\
#2545 •\
#\u0E40\u0E1B\u0E47\u0E19•\
#\u0E1B\u0E35•\
#\u0E09\u0E25\u0E2D\u0E07•\
#\u0E04\u0E23\u0E1A•\
#\u0E23\u0E2D\u0E1A •\
#\"\u0E52\u0E52\u0E50 •\
#\u0E1b\u0E35\" •\
#\u0E02\u0E2d\u0E07•\
#\u0E01\u0E23\u0E38\u0E07•\
#\u0E23\u0E31\u0E15\u0E19\u0E42\u0E01\u0E2A\u0E34\u0E19\u0E17\u0E23\u0E4C •\
#(\u0E01\u0E23\u0E38\u0E07\u0E40\u0E17\u0E1e\u0E2F\
#\u0E2B\u0E23\u0E37\u0E2D •\
#Bangkok)•</data>
<data>•\u0E1B\u0E35•\
\u0E1E\u0E38\u0E17\u0E18\u0E28\u0E31\u0E01\u0E23\u0E32\u0E0A •\
2545 •\
\u0E40\u0E1B\u0E47\u0E19•\
\u0E1B\u0E35•\
\u0E09\u0E25\u0E2D\u0E07•\
\u0E04\u0E23\u0E1A•\
\u0E23\u0E2D\u0E1A •\
\"\u0E52\u0E52\u0E50 •\
\u0E1b\u0E35\" •\
\u0E02\u0E2d\u0E07•\
\u0E01\u0E23\u0E38\u0E07•\
\u0E23\u0E31\u0E15\u0E19\u0E42\u0E01\u0E2A\u0E34\u0E19\u0E17\u0E23\u0E4C •\
(\u0E01\u0E23\u0E38\u0E07\u0E40\u0E17\u0E1e\u0E2F\
\u0E2B\u0E23\u0E37\u0E2D •\
Bangkok)•</data>
# Data originally from RBBITest::TestMaiyamok()
# The Thai maiyamok character is a shorthand symbol that means "repeat the previous
@ -652,58 +661,6 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
\u0e22\u0e07•\
\u0e43\u0e2b\u0e21\u0e48•</data>
##########################################################################################
#
# Khmer Tests
#
##########################################################################################
# Test data originally from http://bugs.icu-project.org/trac/search?q=r30327
# from the file testdata/wordsegments.txt
<locale th>
<word>
#<data>•តើ<200>លោក<200>មក<200>ពី<200>ប្រទេស<200>ណា<200></data>
#<data>•សណ្ដូក<200>ក<200>បណ្ដែត<200>ខ្លួន<200></data>
#<data>•ពណ៌ស<200>ម្ដេច<200>ថា<200>ខ្មៅ<200></data>
##ប្រយោគ|ពី|របៀប|រួបរួម|និង|ភាព|ផ្សេងគ្នា|ដែល|អាច|ចូល<200></data>
#<data>•ប្រយោគ<200>ពី<200>របៀប<200>ដែល<200>និង<200>ភាព<200>ផ្សេងគ្នា<200>ដែល<200>អាច<200>ចូល<200></data>
##ប្រយោគ|ពី|របៀប|ជា|មួយ|និង|ភាព|ផ្សេងគ្នា|ដែល|អាច|ចូល<200></data>
#<data>•សូម<200>ចំណាយពេល<200>បន្តិច<200>ដើម្បី<200>អធិស្ឋាន<200>អរព្រះគុណ<200>ដល់<200>ព្រះអង្គ<200></data>
#<data>•ការ<200>ថោកទាប<200>បរិប្បូណ៌<200>ដោយ<200></data>
#<data>•ប្រើប្រាស់<200>ស្អាត<200>ទាំង<200>ចិត្ត<200>សិស្ស<200>នោះ<200></data>
#<data>•បើ<200>អ្នក<200>ប្រព្រឺត្ត<200>អំពើអាក្រក់<200>មុខ<200>ជា<200>មាន<200></data>
#<data>•ប្រដាប់<200>ប្រដា<200>រ<200>រៀនសូត្រ<200>បន្ទប់<200>រៀន<200></data>
#<data>•ដើរតួ<200>មនុស្សគ<200>ឥត<200>បញ្ចេញ<200>យោបល់<200>សោះ<200>ឡើយ<200></data>
#<data>•មិន<200>អាច<200>ឲ្យ<200>យើង<200>ធ្វើ<200>កសិកម្ម<200>បាន<200>ឡើយ<200></data>
#<data>•បន្ត<200>សេចក្ត<200>ទៅទៀត<200></data>
#<data>•ក្រុម<200>ប៉ូលិស<200>បណ្តាក់<200>គ្នា<200></data>
#<data>•គ្មាន<200>សុខ<200>សំរាន្ត<200>ដង<200>ណា<200></data>
#<data>•បាន<200>សុខភាព<200>បរិប្បូណ៌<200></data>
#<data>•ជា<200>មេចោរ<200>ខ្ញុំ<200>នឹង<200>ស្លាប់<200>ទៅវិញ<200>ជា<200>មេចោរ<200></data>
#<data>•ឯ<200>ការ<200>វាយ<200>ផ្ចាល<200>ដែល<200>នាំ<200></data>
#<data>•គេ<200>ដឹក<200>ទៅ<200>សំឡាប់<200></data>
##អ្នក|ដែល|ជា|មន្ត្រី|ធំ|លើ|គាត់|ទេ<200></data>
#<data>•យក<200>ទៅ<200>សម្លាប់ចោល<200>ស្ងាត់<200></data>
#<data>•ត្រូវ<200>បាន<200>គេ<200>សម្លាប់<200></data>
#<data>•នៅក្នុង<200>ស្រុក<200>ខ្ល<200>ងហ្ស៊ុន<200></data>
#
# Jitterbug 3671 Test Case
#
#<data>•สวัสดี<200>ครับ<200>สบาย<200>ดี<200>ไหม<200> •ครับ<200></data>
#
# Trac ticket 5595 Test Case
#<data>•บท<200>ที่๑พายุ<200>ไซโคลน<200>โด<200>โรธี<200>อาศัย<200>อยู่<200>ท่ามกลาง<200>\
#ทุ่งใหญ่<200>ใน<200>แคนซัส<200>กับ<200>ลุง<200>เฮ<200>นรี<200>ชาวไร่<200>และ<200>ป้า<200>เอ็ม<200>\
#ภรรยา<200>ชาวไร่<200>บ้าน<200>ของ<200>พวก<200>เขา<200>หลัง<200>เล็ก<200>เพราะ<200>ไม้<200>\
#สร้าง<200>บ้าน<200>ต้อง<200>ขน<200>มา<200>ด้วย<200>เกวียน<200>เป็น<200>ระยะ<200>ทาง<200>หลาย<200>\
#ไมล์<200></data>
####################################################################################
#
# Tailored (locale specific) breaking.
@ -714,7 +671,7 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
<locale ja>
<line>
<data>•\u3041\u3043\u3045\u31f1•</data>
<data>•\u3041\u3043\u3045\u31f1•</data>
<locale en>
<line>
<data>•\u3041\u3043\u3045\u31f1•</data>
@ -722,19 +679,20 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
# The following data was originally in RBBITest::TestJapaneseWordBreak()
<locale ja>
<word>
<data>•\u4ECA\u65E5<400>\u306F\u3044\u3044<300>\u5929\u6C17<400>\u3067\u3059\u306D<300>\u3002•\u000D\u000A•</data>
<data>•\u4ECA\u65E5<400>\u306F<400>\u3044\u3044<400>\u5929\u6C17<400>\u3067\u3059<400>\u306D<400>\u3002•\u000D\u000A•</data>
# UBreakIteratorType UBRK_WORD, Locale "ja"
# Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
# \u79C1\u9054\u306B\u4E00\u3007\u3007\u3007\u306E\u30B3\u30F3\u30D4\u30E5\u30FC\u30BF\u304C\u3042\u308B\u3002\u5948\u3005\u306F\u30EF\u30FC\u30C9\u3067\u3042\u308B\u3002
# modified to work with dbbi code - should verify
<locale ja>
<word>
<data>•私達<400>に<300>一〇〇〇<400>の<300>コンピュータ<300>がある<300>。<0>奈々<400>は<300>ワード<300>である<300>。•</data>
<data>•私<400>達<400>に<400>一<400><400><400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈<400>々<400>は<400>ワ<400>ー<400>ドで<400>あ<400>る<400>。•</data>
<locale root>
<word>
<data>•私<400>達<400>に<300>一<400><400><400><400>の<300>コンピュータ<300>が<300>あ<300>る<300>。<0>奈<400>々<200>は<300>ワード<300>で<300>あ<300>る<300>。•</data>
<data>•私<400>達<400>に<400>一<400><400><400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈<400>々<400>は<400>ワ<400>ー<400>ドで<400>あ<400>る<400>。•</data>
# UBreakIteratorType UBRK_SENTENCE, Locale "el"
# Add break after Greek question mark (cldrbug #2069).

View File

@ -474,18 +474,6 @@ public final class ICUResourceBundleTest extends TestFmwk {
errln("Did not get the expected output for referencingalias");
}
}
{
rb = (UResourceBundle)UResourceBundle.getBundleInstance("com/ibm/icu/dev/data/testdata","testaliases",testLoader);
sub = rb.get("boundaries");
String word = sub.getString("word");
if(word.equals("word_ja.brk")){
logln("Got the expected output for boundaries/word");
}else{
errln("Did not get the expected type for boundaries/word");
}
}
{
UResourceBundle rb1 = (UResourceBundle)UResourceBundle.getBundleInstance("com/ibm/icu/dev/data/testdata","testaliases",testLoader);
if(rb1!=rb){

View File

@ -104,23 +104,6 @@ public class ULocaleTest extends TestFmwk {
}
*/
/**
 * Runs the shared service check for word break iterators.
 * Passes the requested locale ID "ja_JP_OSAKA" to checkService together with
 * a facade that creates word break iterators and a registrar that registers
 * and unregisters a prototype iterator under BreakIterator.KIND_WORD.
 * NOTE(review): checkService is defined outside this view; confirm there
 * exactly what it verifies and how the null third argument is treated.
 */
public void TestBreakIterator() {
checkService("ja_JP_OSAKA", new ServiceFacade() {
// Creates the word break iterator for the requested locale.
public Object create(ULocale req) {
return BreakIterator.getWordInstance(req);
}
}, null, new Registrar() {
// Registers the prototype as the KIND_WORD break iterator for loc.
public Object register(ULocale loc, Object prototype) {
return BreakIterator.registerInstance(
(BreakIterator) prototype,
loc, BreakIterator.KIND_WORD);
}
// Removes a registration; presumably key is the object returned by register -- confirm against checkService.
public boolean unregister(Object key) {
return BreakIterator.unregister(key);
}
});
}
public void TestDateFormat() {
checkService("de_CH_ZURICH", new ServiceFacade() {
public Object create(ULocale req) {