ICU-9353 merge dbbi-tries work into the trunk
X-SVN-Rev: 32185
This commit is contained in:
parent
c64c0299d7
commit
ed2c14b425
@ -1583,7 +1583,7 @@
|
||||
<include name="**/pnames.icu"/>
|
||||
<include name="**/*.res"/>
|
||||
<include name="**/*.brk"/>
|
||||
<include name="**/*.ctd"/>
|
||||
<include name="**/*.dict"/>
|
||||
<include name="**/*.nrm"/>
|
||||
<exclude name="**/coll/*.res"/>
|
||||
<exclude name="**/translit/*.res"/>
|
||||
@ -1676,7 +1676,7 @@
|
||||
<include name="**/unames.icu"/>
|
||||
<include name="**/pnames.icu"/>
|
||||
<include name="**/*.brk"/>
|
||||
<include name="**/*.ctd"/>
|
||||
<include name="**/*.dict"/>
|
||||
<include name="**/*.nrm"/>
|
||||
<include name="**/brkitr/*.res"/>
|
||||
<include name="**/translit/*.res"/>
|
||||
|
@ -0,0 +1,126 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
||||
public final class CharacterIteration {
|
||||
// disallow instantiation
|
||||
private CharacterIteration() { }
|
||||
|
||||
// 32 bit Char value returned from when an iterator has run out of range.
|
||||
// Positive value so fast case (not end, not surrogate) can be checked
|
||||
// with a single test.
|
||||
public static int DONE32 = 0x7fffffff;
|
||||
|
||||
/**
|
||||
* Move the iterator forward to the next code point, and return that code point,
|
||||
* leaving the iterator positioned at char returned.
|
||||
* For Supplementary chars, the iterator is left positioned at the lead surrogate.
|
||||
* @param ci The character iterator
|
||||
* @return The next code point.
|
||||
*/
|
||||
public static int next32(CharacterIterator ci) {
|
||||
// If the current position is at a surrogate pair, move to the trail surrogate
|
||||
// which leaves it in positon for underlying iterator's next() to work.
|
||||
int c= ci.current();
|
||||
if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE && c<=UTF16.LEAD_SURROGATE_MAX_VALUE) {
|
||||
c = ci.next();
|
||||
if (c<UTF16.TRAIL_SURROGATE_MIN_VALUE || c>UTF16.TRAIL_SURROGATE_MAX_VALUE) {
|
||||
c = ci.previous();
|
||||
}
|
||||
}
|
||||
|
||||
// For BMP chars, this next() is the real deal.
|
||||
c = ci.next();
|
||||
|
||||
// If we might have a lead surrogate, we need to peak ahead to get the trail
|
||||
// even though we don't want to really be positioned there.
|
||||
if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
|
||||
c = nextTrail32(ci, c);
|
||||
}
|
||||
|
||||
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != DONE32) {
|
||||
// We got a supplementary char. Back the iterator up to the postion
|
||||
// of the lead surrogate.
|
||||
ci.previous();
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
|
||||
// Out-of-line portion of the in-line Next32 code.
|
||||
// The call site does an initial ci.next() and calls this function
|
||||
// if the 16 bit value it gets is >= LEAD_SURROGATE_MIN_VALUE.
|
||||
// NOTE: we leave the underlying char iterator positioned in the
|
||||
// middle of a surroage pair. ci.next() will work correctly
|
||||
// from there, but the ci.getIndex() will be wrong, and needs
|
||||
// adjustment.
|
||||
public static int nextTrail32(CharacterIterator ci, int lead) {
|
||||
int retVal = lead;
|
||||
if (lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
|
||||
char cTrail = ci.next();
|
||||
if (UTF16.isTrailSurrogate(cTrail)) {
|
||||
retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
|
||||
(cTrail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
|
||||
UTF16.SUPPLEMENTARY_MIN_VALUE;
|
||||
} else {
|
||||
ci.previous();
|
||||
}
|
||||
} else {
|
||||
if (lead == CharacterIterator.DONE && ci.getIndex() >= ci.getEndIndex()) {
|
||||
retVal = DONE32;
|
||||
}
|
||||
}
|
||||
return retVal;
|
||||
}
|
||||
|
||||
public static int previous32(CharacterIterator ci) {
|
||||
if (ci.getIndex() <= ci.getBeginIndex()) {
|
||||
return DONE32;
|
||||
}
|
||||
char trail = ci.previous();
|
||||
int retVal = trail;
|
||||
if (UTF16.isTrailSurrogate(trail) && ci.getIndex()>ci.getBeginIndex()) {
|
||||
char lead = ci.previous();
|
||||
if (UTF16.isLeadSurrogate(lead)) {
|
||||
retVal = (((int)lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
|
||||
((int)trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
|
||||
UTF16.SUPPLEMENTARY_MIN_VALUE;
|
||||
} else {
|
||||
ci.next();
|
||||
}
|
||||
}
|
||||
return retVal;
|
||||
}
|
||||
|
||||
public static int current32(CharacterIterator ci) {
|
||||
char lead = ci.current();
|
||||
int retVal = lead;
|
||||
if (retVal < UTF16.LEAD_SURROGATE_MIN_VALUE) {
|
||||
return retVal;
|
||||
}
|
||||
if (UTF16.isLeadSurrogate(lead)) {
|
||||
int trail = (int)ci.next();
|
||||
ci.previous();
|
||||
if (UTF16.isTrailSurrogate((char)trail)) {
|
||||
retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
|
||||
(trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
|
||||
UTF16.SUPPLEMENTARY_MIN_VALUE;
|
||||
}
|
||||
} else {
|
||||
if (lead == CharacterIterator.DONE) {
|
||||
if (ci.getIndex() >= ci.getEndIndex()) {
|
||||
retVal = DONE32;
|
||||
}
|
||||
}
|
||||
}
|
||||
return retVal;
|
||||
}
|
||||
}
|
@ -732,6 +732,11 @@ s */
|
||||
|
||||
BreakIteratorCache cache = new BreakIteratorCache(where, result);
|
||||
iterCache[kind] = new SoftReference<BreakIteratorCache>(cache);
|
||||
if (result instanceof RuleBasedBreakIterator) {
|
||||
RuleBasedBreakIterator rbbi = (RuleBasedBreakIterator)result;
|
||||
rbbi.setBreakType(kind);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2002-2010, International Business Machines Corporation and *
|
||||
* Copyright (C) 2002-2012, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -90,28 +90,20 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
|
||||
* pre-compiled break rules. The resource bundle name is "boundaries".
|
||||
* The value for each key will be the rules to be used for the
|
||||
* specified locale - "word" -> "word_th" for Thai, for example.
|
||||
* DICTIONARY_POSSIBLE indexes in the same way, and indicates whether a
|
||||
* dictionary is a possibility for that type of break. This is just
|
||||
* an optimization to avoid a resource lookup where no dictionary is
|
||||
* ever possible.
|
||||
*/
|
||||
private static final String[] KIND_NAMES = {
|
||||
"grapheme", "word", "line", "sentence", "title"
|
||||
};
|
||||
private static final boolean[] DICTIONARY_POSSIBLE = {
|
||||
false, true, true, false, false
|
||||
};
|
||||
|
||||
|
||||
private static BreakIterator createBreakInstance(ULocale locale, int kind) {
|
||||
|
||||
BreakIterator iter = null;
|
||||
ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BRKITR_BASE_NAME, locale);
|
||||
RuleBasedBreakIterator iter = null;
|
||||
ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BRKITR_BASE_NAME, locale);
|
||||
|
||||
//
|
||||
// Get the binary rules. These are needed for both normal RulesBasedBreakIterators
|
||||
// and for Dictionary iterators.
|
||||
//
|
||||
// Get the binary rules.
|
||||
//
|
||||
InputStream ruleStream = null;
|
||||
try {
|
||||
String typeKey = KIND_NAMES[kind];
|
||||
@ -122,51 +114,22 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
|
||||
catch (Exception e) {
|
||||
throw new MissingResourceException(e.toString(),"","");
|
||||
}
|
||||
|
||||
//
|
||||
// Check whether a dictionary exists, and create a DBBI iterator is
|
||||
// one does.
|
||||
//
|
||||
if (DICTIONARY_POSSIBLE[kind]) {
|
||||
// This type of break iterator could potentially use a dictionary.
|
||||
//
|
||||
try {
|
||||
if (locale.getLanguage().equals("th")){
|
||||
// If the language is Thai, load the thai compact trie dictionary.
|
||||
String dictType = "Thai";
|
||||
String dictFileName = rb.getStringWithFallback("dictionaries/" + dictType);
|
||||
dictFileName = ICUResourceBundle.ICU_BUNDLE +ICUResourceBundle.ICU_BRKITR_NAME+ "/" + dictFileName;
|
||||
InputStream is = ICUData.getStream(dictFileName);
|
||||
iter = new ThaiBreakIterator(ruleStream, is);
|
||||
}
|
||||
} catch (MissingResourceException e) {
|
||||
// Couldn't find a dictionary.
|
||||
// This is normal, and will occur whenever creating a word or line
|
||||
// break iterator for a locale that does not have a BreakDictionaryData
|
||||
// resource - meaning for all but Thai.
|
||||
// Fall through to creating a normal RulebasedBreakIterator.
|
||||
} catch (IOException e) {
|
||||
Assert.fail(e);
|
||||
}
|
||||
}
|
||||
|
||||
if (iter == null) {
|
||||
//
|
||||
// Create a normal RuleBasedBreakIterator.
|
||||
// We have determined that this is not supposed to be a dictionary iterator.
|
||||
//
|
||||
try {
|
||||
iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(ruleStream);
|
||||
}
|
||||
catch (IOException e) {
|
||||
// Shouldn't be possible to get here.
|
||||
// If it happens, the compiled rules are probably corrupted in some way.
|
||||
Assert.fail(e);
|
||||
}
|
||||
//
|
||||
// Create a normal RuleBasedBreakIterator.
|
||||
//
|
||||
try {
|
||||
iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(ruleStream);
|
||||
}
|
||||
catch (IOException e) {
|
||||
// Shouldn't be possible to get here.
|
||||
// If it happens, the compiled rules are probably corrupted in some way.
|
||||
Assert.fail(e);
|
||||
}
|
||||
// TODO: Determine valid and actual locale correctly.
|
||||
ULocale uloc = ULocale.forLocale(rb.getLocale());
|
||||
iter.setLocale(uloc, uloc);
|
||||
iter.setBreakType(kind);
|
||||
|
||||
return iter;
|
||||
|
||||
|
@ -0,0 +1,83 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
|
||||
import com.ibm.icu.impl.Assert;
|
||||
import com.ibm.icu.util.BytesTrie;
|
||||
import com.ibm.icu.util.BytesTrie.Result;
|
||||
|
||||
class BytesDictionaryMatcher extends DictionaryMatcher {
|
||||
private final byte[] characters;
|
||||
private final int transform;
|
||||
|
||||
public BytesDictionaryMatcher(byte[] chars, int transform) {
|
||||
characters = chars;
|
||||
Assert.assrt((transform & DictionaryData.TRANSFORM_TYPE_MASK) == DictionaryData.TRANSFORM_TYPE_OFFSET);
|
||||
// while there is only one transform type so far, save the entire transform constant so that
|
||||
// if we add any others, we need only change code in transform() and the assert above rather
|
||||
// than adding a "transform type" variable
|
||||
this.transform = transform;
|
||||
}
|
||||
|
||||
private int transform(int c) {
|
||||
if (c == 0x200D) {
|
||||
return 0xFF;
|
||||
} else if (c == 0x200C) {
|
||||
return 0xFE;
|
||||
}
|
||||
|
||||
int delta = c - (transform & DictionaryData.TRANSFORM_OFFSET_MASK);
|
||||
if (delta < 0 || 0xFD < delta) {
|
||||
return -1;
|
||||
}
|
||||
return delta;
|
||||
}
|
||||
|
||||
public int matches(CharacterIterator text_, int maxLength, int[] lengths, int[] count_, int limit, int[] values) {
|
||||
UCharacterIterator text = UCharacterIterator.getInstance(text_);
|
||||
BytesTrie bt = new BytesTrie(characters, 0);
|
||||
int c = text.nextCodePoint();
|
||||
Result result = bt.first(transform(c));
|
||||
// TODO: should numChars count Character.charCount() ?
|
||||
int numChars = 1;
|
||||
int count = 0;
|
||||
for (;;) {
|
||||
if (result.hasValue()) {
|
||||
if (count < limit) {
|
||||
if (values != null) {
|
||||
values[count] = bt.getValue();
|
||||
}
|
||||
lengths[count] = numChars;
|
||||
count++;
|
||||
}
|
||||
if (result == Result.FINAL_VALUE) {
|
||||
break;
|
||||
}
|
||||
} else if (result == Result.NO_MATCH) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (numChars >= maxLength) {
|
||||
break;
|
||||
}
|
||||
|
||||
c = text.nextCodePoint();
|
||||
++numChars;
|
||||
result = bt.next(transform(c));
|
||||
}
|
||||
count_[0] = count;
|
||||
return numChars;
|
||||
}
|
||||
|
||||
public int getType() {
|
||||
return DictionaryData.TRIE_TYPE_BYTES;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,61 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
|
||||
import com.ibm.icu.util.BytesTrie.Result;
|
||||
import com.ibm.icu.util.CharsTrie;
|
||||
|
||||
class CharsDictionaryMatcher extends DictionaryMatcher {
|
||||
private CharSequence characters;
|
||||
|
||||
public CharsDictionaryMatcher(CharSequence chars) {
|
||||
characters = chars;
|
||||
}
|
||||
|
||||
public int matches(CharacterIterator text_, int maxLength, int[] lengths, int[] count_, int limit, int[] values) {
|
||||
UCharacterIterator text = UCharacterIterator.getInstance(text_);
|
||||
CharsTrie uct = new CharsTrie(characters, 0);
|
||||
int c = text.nextCodePoint();
|
||||
Result result = uct.firstForCodePoint(c);
|
||||
// TODO: should numChars count Character.charCount?
|
||||
int numChars = 1;
|
||||
int count = 0;
|
||||
for (;;) {
|
||||
if (result.hasValue()) {
|
||||
if (count < limit) {
|
||||
if (values != null) {
|
||||
values[count] = uct.getValue();
|
||||
}
|
||||
lengths[count] = numChars;
|
||||
count++;
|
||||
}
|
||||
|
||||
if (result == Result.FINAL_VALUE) {
|
||||
break;
|
||||
}
|
||||
} else if (result == Result.NO_MATCH) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (numChars >= maxLength) {
|
||||
break;
|
||||
}
|
||||
c = text.nextCodePoint();
|
||||
++numChars;
|
||||
result = uct.nextForCodePoint(c);
|
||||
}
|
||||
count_[0] = count;
|
||||
return numChars;
|
||||
}
|
||||
|
||||
public int getType() {
|
||||
return DictionaryData.TRIE_TYPE_UCHARS;
|
||||
}
|
||||
}
|
||||
|
218
icu4j/main/classes/core/src/com/ibm/icu/text/CjkBreakEngine.java
Normal file
218
icu4j/main/classes/core/src/com/ibm/icu/text/CjkBreakEngine.java
Normal file
@ -0,0 +1,218 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.CharacterIterator;
|
||||
import java.util.Stack;
|
||||
|
||||
import com.ibm.icu.impl.Assert;
|
||||
|
||||
import static com.ibm.icu.impl.CharacterIteration.*;
|
||||
|
||||
public class CjkBreakEngine implements LanguageBreakEngine {
|
||||
private static final UnicodeSet fHangulWordSet = new UnicodeSet();
|
||||
private static final UnicodeSet fHanWordSet = new UnicodeSet();
|
||||
private static final UnicodeSet fKatakanaWordSet = new UnicodeSet();
|
||||
private static final UnicodeSet fHiraganaWordSet = new UnicodeSet();
|
||||
static {
|
||||
fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]");
|
||||
fHanWordSet.applyPattern("[:Han:]");
|
||||
fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]");
|
||||
fHiraganaWordSet.applyPattern("[:Hiragana:]");
|
||||
|
||||
// freeze them all
|
||||
fHangulWordSet.freeze();
|
||||
fHanWordSet.freeze();
|
||||
fKatakanaWordSet.freeze();
|
||||
fHiraganaWordSet.freeze();
|
||||
}
|
||||
|
||||
private final UnicodeSet fWordSet;
|
||||
private DictionaryMatcher fDictionary = null;
|
||||
|
||||
public CjkBreakEngine(boolean korean) throws IOException {
|
||||
fDictionary = DictionaryData.loadDictionaryFor("Hira");
|
||||
if (korean) {
|
||||
fWordSet = fHangulWordSet;
|
||||
} else {
|
||||
fWordSet = new UnicodeSet();
|
||||
fWordSet.addAll(fHanWordSet);
|
||||
fWordSet.addAll(fKatakanaWordSet);
|
||||
fWordSet.addAll(fHiraganaWordSet);
|
||||
fWordSet.add("\\uff70\\u30fc");
|
||||
}
|
||||
}
|
||||
|
||||
public boolean handles(int c, int breakType) {
|
||||
return (breakType == BreakIterator.KIND_WORD) &&
|
||||
(fWordSet.contains(c));
|
||||
}
|
||||
|
||||
private static final int kMaxKatakanaLength = 8;
|
||||
private static final int kMaxKatakanaGroupLength = 20;
|
||||
private static final int maxSnlp = 255;
|
||||
private static final int kint32max = Integer.MAX_VALUE;
|
||||
private static int getKatakanaCost(int wordlength) {
|
||||
int katakanaCost[] = new int[] { 8192, 984, 408, 240, 204, 252, 300, 372, 480 };
|
||||
return (wordlength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordlength];
|
||||
}
|
||||
|
||||
private static boolean isKatakana(int value) {
|
||||
return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
|
||||
(value >= 0xFF66 && value <= 0xFF9F);
|
||||
}
|
||||
|
||||
public int findBreaks(CharacterIterator inText, int startPos, int endPos,
|
||||
boolean reverse, int breakType, Stack<Integer> foundBreaks) {
|
||||
if (startPos >= endPos) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
inText.setIndex(startPos);
|
||||
|
||||
int inputLength = endPos - startPos;
|
||||
int[] charPositions = new int[inputLength + 1];
|
||||
StringBuffer s = new StringBuffer("");
|
||||
inText.setIndex(startPos);
|
||||
while (inText.getIndex() < endPos) {
|
||||
s.append(inText.current());
|
||||
inText.next();
|
||||
}
|
||||
String prenormstr = s.toString();
|
||||
boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES ||
|
||||
Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0);
|
||||
CharacterIterator text = inText;
|
||||
int numChars = 0;
|
||||
if (isNormalized) {
|
||||
int index = 0;
|
||||
charPositions[0] = 0;
|
||||
while (index < prenormstr.length()) {
|
||||
int codepoint = prenormstr.codePointAt(index);
|
||||
index += Character.charCount(codepoint);
|
||||
numChars++;
|
||||
charPositions[numChars] = index;
|
||||
}
|
||||
} else {
|
||||
String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC);
|
||||
text = new java.text.StringCharacterIterator(normStr);
|
||||
charPositions = new int[normStr.length() + 1];
|
||||
Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0);
|
||||
int index = 0;
|
||||
charPositions[0] = 0;
|
||||
while (index < normalizer.endIndex()) {
|
||||
normalizer.next();
|
||||
numChars++;
|
||||
index = normalizer.getIndex();
|
||||
charPositions[numChars] = index;
|
||||
}
|
||||
}
|
||||
|
||||
// From here on out, do the algorithm. Note that our indices
|
||||
// refer to indices within the normalized string.
|
||||
int[] bestSnlp = new int[numChars + 1];
|
||||
bestSnlp[0] = 0;
|
||||
for (int i = 1; i <= numChars; i++) {
|
||||
bestSnlp[i] = kint32max;
|
||||
}
|
||||
|
||||
int[] prev = new int[numChars + 1];
|
||||
for (int i = 0; i <= numChars; i++) {
|
||||
prev[i] = -1;
|
||||
}
|
||||
|
||||
final int maxWordSize = 20;
|
||||
int values[] = new int[numChars];
|
||||
int lengths[] = new int[numChars];
|
||||
// dynamic programming to find the best segmentation
|
||||
boolean is_prev_katakana = false;
|
||||
for (int i = 0; i < numChars; i++) {
|
||||
text.setIndex(i);
|
||||
if (bestSnlp[i] == kint32max) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i);
|
||||
int[] count_ = new int[1];
|
||||
fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
|
||||
int count = count_[0];
|
||||
|
||||
// if there are no single character matches found in the dictionary
|
||||
// starting with this character, treat character as a 1-character word
|
||||
// with the highest value possible (i.e. the least likely to occur).
|
||||
// Exclude Korean characters from this treatment, as they should be
|
||||
// left together by default.
|
||||
if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) {
|
||||
values[count] = maxSnlp;
|
||||
lengths[count] = 1;
|
||||
count++;
|
||||
}
|
||||
|
||||
for (int j = 0; j < count; j++) {
|
||||
int newSnlp = bestSnlp[i] + values[j];
|
||||
if (newSnlp < bestSnlp[lengths[j] + i]) {
|
||||
bestSnlp[lengths[j] + i] = newSnlp;
|
||||
prev[lengths[j] + i] = i;
|
||||
}
|
||||
}
|
||||
|
||||
// In Japanese, single-character Katakana words are pretty rare.
|
||||
// So we apply the following heuristic to Katakana: any continuous
|
||||
// run of Katakana characters is considered a candidate word with
|
||||
// a default cost specified in the katakanaCost table according
|
||||
// to its length.
|
||||
text.setIndex(i);
|
||||
boolean is_katakana = isKatakana(current32(text));
|
||||
if (!is_prev_katakana && is_katakana) {
|
||||
int j = i + 1;
|
||||
next32(text);
|
||||
while (j < numChars && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) {
|
||||
next32(text);
|
||||
++j;
|
||||
}
|
||||
|
||||
if ((j - i) < kMaxKatakanaGroupLength) {
|
||||
int newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
|
||||
if (newSnlp < bestSnlp[j]) {
|
||||
bestSnlp[j] = newSnlp;
|
||||
prev[j] = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
is_prev_katakana = is_katakana;
|
||||
}
|
||||
|
||||
int t_boundary[] = new int[numChars + 1];
|
||||
int numBreaks = 0;
|
||||
if (bestSnlp[numChars] == kint32max) {
|
||||
t_boundary[numBreaks] = numChars;
|
||||
numBreaks++;
|
||||
} else {
|
||||
for (int i = numChars; i > 0; i = prev[i]) {
|
||||
t_boundary[numBreaks] = i;
|
||||
numBreaks++;
|
||||
}
|
||||
Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0);
|
||||
}
|
||||
|
||||
if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) {
|
||||
t_boundary[numBreaks++] = 0;
|
||||
}
|
||||
|
||||
for (int i = numBreaks - 1; i >= 0; i--) {
|
||||
int pos = charPositions[t_boundary[i]] + startPos;
|
||||
if (!(foundBreaks.contains(pos) || pos == startPos))
|
||||
foundBreaks.push(charPositions[t_boundary[i]] + startPos);
|
||||
}
|
||||
|
||||
if (!foundBreaks.empty() && foundBreaks.peek() == endPos)
|
||||
foundBreaks.pop();
|
||||
if (!foundBreaks.empty())
|
||||
inText.setIndex(foundBreaks.peek());
|
||||
return 0;
|
||||
}
|
||||
}
|
@ -1,565 +0,0 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2010, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.text.CharacterIterator;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Stack;
|
||||
|
||||
import com.ibm.icu.impl.Assert;
|
||||
|
||||
|
||||
/**
|
||||
* A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
|
||||
* to further subdivide ranges of text beyond what is possible using just the
|
||||
* state-table-based algorithm. This is necessary, for example, to handle
|
||||
* word and line breaking in Thai, which doesn't use spaces between words. The
|
||||
* state-table-based algorithm used by RuleBasedBreakIterator_Old is used to divide
|
||||
* up text as far as possible, and then contiguous ranges of letters are
|
||||
* repeatedly compared against a list of known words (i.e., the dictionary)
|
||||
* to divide them up into words.
|
||||
*
|
||||
* DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator_Old,
|
||||
* but adds one more special substitution name: _dictionary_. This substitution
|
||||
* name is used to identify characters in words in the dictionary. The idea is that
|
||||
* if the iterator passes over a chunk of text that includes two or more characters
|
||||
* in a row that are included in _dictionary_, it goes back through that range and
|
||||
* derives additional break positions (if possible) using the dictionary.
|
||||
*
|
||||
* DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
|
||||
* file. It uses Class.getResource() to locate the dictionary file. The
|
||||
* dictionary file is in a serialized binary format. We have a very primitive (and
|
||||
* slow) BuildDictionaryFile utility for creating dictionary files, but aren't
|
||||
* currently making it public. Contact us for help.
|
||||
*
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
|
||||
|
||||
/**
|
||||
* Keeps track of if we are using the compact trie dictionary.
|
||||
*/
|
||||
private boolean usingCTDictionary = false;
|
||||
/**
|
||||
* a list of known words that is used to divide up contiguous ranges of letters,
|
||||
* stored in a compressed, indexed, format that offers fast access
|
||||
*/
|
||||
private BreakDictionary dictionary;
|
||||
|
||||
/*
|
||||
* a list of flags indicating which character categories are contained in
|
||||
* the dictionary file (this is used to determine which ranges of characters
|
||||
* to apply the dictionary to)
|
||||
*/
|
||||
//private boolean[] categoryFlags;
|
||||
|
||||
|
||||
/**
|
||||
* when a range of characters is divided up using the dictionary, the break
|
||||
* positions that are discovered are stored here, preventing us from having
|
||||
* to use either the dictionary or the state table again until the iterator
|
||||
* leaves this range of text
|
||||
*/
|
||||
int[] cachedBreakPositions;
|
||||
|
||||
/**
|
||||
* if cachedBreakPositions is not null, this indicates which item in the
|
||||
* cache the current iteration position refers to
|
||||
*/
|
||||
int positionInCache;
|
||||
|
||||
/**
|
||||
* Special variable name for characters in words in dictionary
|
||||
*/
|
||||
|
||||
/**
|
||||
* Construct a DictionarBasedBreakIterator from precompiled rules. Use by ThaiBreakEngine
|
||||
* uses the BreakCTDictionary.
|
||||
* @param compiledRules an input stream containing the binary (flattened) compiled rules.
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
protected DictionaryBasedBreakIterator(InputStream compiledRules) throws IOException {
|
||||
fRData = RBBIDataWrapper.get(compiledRules); // Init the RBBI part of this iterator.
|
||||
dictionary = null;
|
||||
usingCTDictionary = true;
|
||||
}
|
||||
/**
|
||||
* Constructs a DictionaryBasedBreakIterator.
|
||||
* @param rules Same as the rules parameter on RuleBasedBreakIterator,
|
||||
* except for the special meaning of "_dictionary_". This parameter is just
|
||||
* passed through to RuleBasedBreakIterator constructor.
|
||||
* @param dictionaryStream the stream containing the dictionary data
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public DictionaryBasedBreakIterator(String rules,
|
||||
InputStream dictionaryStream) throws IOException {
|
||||
super(rules);
|
||||
dictionary = new BreakDictionary(dictionaryStream);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Construct a DictionarBasedBreakIterator from precompiled rules.
|
||||
* @param compiledRules an input stream containing the binary (flattened) compiled rules.
|
||||
* @param dictionaryStream an input stream containing the dictionary data
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
public DictionaryBasedBreakIterator(InputStream compiledRules,
|
||||
InputStream dictionaryStream) throws IOException {
|
||||
fRData = RBBIDataWrapper.get(compiledRules); // Init the RBBI part of this iterator.
|
||||
dictionary = new BreakDictionary(dictionaryStream);
|
||||
}
|
||||
|
||||
|
||||
/** @stable ICU 2.0 */
|
||||
public void setText(CharacterIterator newText) {
|
||||
super.setText(newText);
|
||||
cachedBreakPositions = null;
|
||||
fDictionaryCharCount = 0;
|
||||
positionInCache = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the current iteration position to the beginning of the text.
|
||||
* (i.e., the CharacterIterator's starting offset).
|
||||
* @return The offset of the beginning of the text.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public int first() {
|
||||
cachedBreakPositions = null;
|
||||
fDictionaryCharCount = 0;
|
||||
positionInCache = 0;
|
||||
return super.first();
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the current iteration position to the end of the text.
|
||||
* (i.e., the CharacterIterator's ending offset).
|
||||
* @return The text's past-the-end offset.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public int last() {
|
||||
cachedBreakPositions = null;
|
||||
fDictionaryCharCount = 0;
|
||||
positionInCache = 0;
|
||||
return super.last();
|
||||
}
|
||||
|
||||
/**
|
||||
* Advances the iterator one step backwards.
|
||||
* @return The position of the last boundary position before the
|
||||
* current iteration position
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public int previous() {
|
||||
CharacterIterator text = getText();
|
||||
|
||||
// if we have cached break positions and we're still in the range
|
||||
// covered by them, just move one step backward in the cache
|
||||
if (cachedBreakPositions != null && positionInCache > 0) {
|
||||
--positionInCache;
|
||||
text.setIndex(cachedBreakPositions[positionInCache]);
|
||||
return cachedBreakPositions[positionInCache];
|
||||
}
|
||||
|
||||
// otherwise, dump the cache and use the inherited previous() method to move
|
||||
// backward. This may fill up the cache with new break positions, in which
|
||||
// case we have to mark our position in the cache. If it doesn't, use next()
|
||||
// to move forward until we hit or pass the current position. This *will* fill
|
||||
// the cache.
|
||||
else {
|
||||
cachedBreakPositions = null;
|
||||
int offset = current();
|
||||
int result = super.previous();
|
||||
|
||||
if (cachedBreakPositions != null) {
|
||||
positionInCache = cachedBreakPositions.length - 2;
|
||||
return result;
|
||||
}
|
||||
|
||||
while (result < offset) {
|
||||
int nextResult = next();
|
||||
|
||||
if (nextResult >= offset) {
|
||||
break;
|
||||
}
|
||||
|
||||
result = nextResult;
|
||||
}
|
||||
|
||||
if (cachedBreakPositions != null) {
|
||||
positionInCache = cachedBreakPositions.length - 2;
|
||||
}
|
||||
|
||||
if (result != BreakIterator.DONE) {
|
||||
text.setIndex(result);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the current iteration position to the last boundary position
|
||||
* before the specified position.
|
||||
* @param offset The position to begin searching from
|
||||
* @return The position of the last boundary before "offset"
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public int preceding(int offset) {
|
||||
CharacterIterator text = getText();
|
||||
checkOffset(offset, text);
|
||||
|
||||
// if we have no cached break positions, or "offset" is outside the
|
||||
// range covered by the cache, we can just call the inherited routine
|
||||
// (which will eventually call other routines in this class that may
|
||||
// refresh the cache)
|
||||
if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] ||
|
||||
offset > cachedBreakPositions[cachedBreakPositions.length - 1]) {
|
||||
cachedBreakPositions = null;
|
||||
return super.preceding(offset);
|
||||
}
|
||||
|
||||
// on the other hand, if "offset" is within the range covered by the cache,
|
||||
// then all we have to do is search the cache for the last break position
|
||||
// before "offset"
|
||||
else {
|
||||
positionInCache = 0;
|
||||
while (positionInCache < cachedBreakPositions.length
|
||||
&& offset > cachedBreakPositions[positionInCache])
|
||||
++positionInCache;
|
||||
--positionInCache;
|
||||
text.setIndex(cachedBreakPositions[positionInCache]);
|
||||
return text.getIndex();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the current iteration position to the first boundary position after
|
||||
* the specified position.
|
||||
* @param offset The position to begin searching forward from
|
||||
* @return The position of the first boundary after "offset"
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public int following(int offset) {
|
||||
CharacterIterator text = getText();
|
||||
checkOffset(offset, text);
|
||||
|
||||
// if we have no cached break positions, or if "offset" is outside the
|
||||
// range covered by the cache, then dump the cache and call our
|
||||
// inherited following() method. This will call other methods in this
|
||||
// class that may refresh the cache.
|
||||
if (cachedBreakPositions == null || offset < cachedBreakPositions[0] ||
|
||||
offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) {
|
||||
cachedBreakPositions = null;
|
||||
return super.following(offset);
|
||||
}
|
||||
|
||||
// on the other hand, if "offset" is within the range covered by the
|
||||
// cache, then just search the cache for the first break position
|
||||
// after "offset"
|
||||
else {
|
||||
positionInCache = 0;
|
||||
while (positionInCache < cachedBreakPositions.length
|
||||
&& offset >= cachedBreakPositions[positionInCache])
|
||||
++positionInCache;
|
||||
text.setIndex(cachedBreakPositions[positionInCache]);
|
||||
return text.getIndex();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Return the status tag from the break rule that determined the most recently
|
||||
* returned break position.
|
||||
*
|
||||
* TODO: not supported with dictionary based break iterators.
|
||||
*
|
||||
* @return the status from the break rule that determined the most recently
|
||||
* returned break position.
|
||||
* @draft ICU 3.0
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public int getRuleStatus() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the status (tag) values from the break rule(s) that determined the most
|
||||
* recently returned break position. The values appear in the rule source
|
||||
* within brackets, {123}, for example. The default status value for rules
|
||||
* that do not explicitly provide one is zero.
|
||||
* <p>
|
||||
* TODO: not supported for dictionary based break iterator.
|
||||
*
|
||||
* @param fillInArray an array to be filled in with the status values.
|
||||
* @return The number of rule status values from rules that determined
|
||||
* the most recent boundary returned by the break iterator.
|
||||
* In the event that the array is too small, the return value
|
||||
* is the total number of status values that were available,
|
||||
* not the reduced number that were actually returned.
|
||||
* @draft ICU 3.0
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public int getRuleStatusVec(int[] fillInArray) {
|
||||
if (fillInArray != null && fillInArray.length>=1) {
|
||||
fillInArray[0] = 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
/**
|
||||
* This is the implementation function for next().
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
protected int handleNext() {
|
||||
CharacterIterator text = getText();
|
||||
|
||||
// if there are no cached break positions, or if we've just moved
|
||||
// off the end of the range covered by the cache, we have to dump
|
||||
// and possibly regenerate the cache
|
||||
if (cachedBreakPositions == null || positionInCache == cachedBreakPositions.length - 1) {
|
||||
|
||||
// start by using the inherited handleNext() to find a tentative return
|
||||
// value. dictionaryCharCount tells us how many dictionary characters
|
||||
// we passed over on our way to the tentative return value
|
||||
int startPos = text.getIndex();
|
||||
fDictionaryCharCount = 0;
|
||||
int result = super.handleNext();
|
||||
|
||||
// if we passed over more than one dictionary character, then we use
|
||||
// divideUpDictionaryRange() to regenerate the cached break positions
|
||||
// for the new range.
|
||||
if (!usingCTDictionary && fDictionaryCharCount > 1 && result - startPos > 1) {
|
||||
divideUpDictionaryRange(startPos, result);
|
||||
}
|
||||
|
||||
// otherwise, the value we got back from the inherited fuction
|
||||
// is our return value, and we can dump the cache
|
||||
else {
|
||||
cachedBreakPositions = null;
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
// if the cache of break positions has been regenerated (or existed all
|
||||
// along), then just advance to the next break position in the cache
|
||||
// and return it
|
||||
if (cachedBreakPositions != null) {
|
||||
++positionInCache;
|
||||
text.setIndex(cachedBreakPositions[positionInCache]);
|
||||
return cachedBreakPositions[positionInCache];
|
||||
}
|
||||
///CLOVER:OFF
|
||||
Assert.assrt(false);
|
||||
return -9999; // SHOULD NEVER GET HERE!
|
||||
///CLOVER:ON
|
||||
}
|
||||
|
||||
/**
|
||||
* This is the function that actually implements the dictionary-based
|
||||
* algorithm. Given the endpoints of a range of text, it uses the
|
||||
* dictionary to determine the positions of any boundaries in this
|
||||
* range. It stores all the boundary positions it discovers in
|
||||
* cachedBreakPositions so that we only have to do this work once
|
||||
* for each time we enter the range.
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
private void divideUpDictionaryRange(int startPos, int endPos) {
|
||||
CharacterIterator text = getText();
|
||||
|
||||
// the range we're dividing may begin or end with non-dictionary characters
|
||||
// (i.e., for line breaking, we may have leading or trailing punctuation
|
||||
// that needs to be kept with the word). Seek from the beginning of the
|
||||
// range to the first dictionary character
|
||||
text.setIndex(startPos);
|
||||
int c = CICurrent32(text);
|
||||
while (isDictionaryChar(c) == false) {
|
||||
c = CINext32(text);
|
||||
}
|
||||
|
||||
//System.out.println("\nDividing up range from " + (text.getIndex() + 1) + " to " + endPos);
|
||||
|
||||
// initialize. We maintain two stacks: currentBreakPositions contains
|
||||
// the list of break positions that will be returned if we successfully
|
||||
// finish traversing the whole range now. possibleBreakPositions lists
|
||||
// all other possible word ends we've passed along the way. (Whenever
|
||||
// we reach an error [a sequence of characters that can't begin any word
|
||||
// in the dictionary], we back up, possibly delete some breaks from
|
||||
// currentBreakPositions, move a break from possibleBreakPositions
|
||||
// to currentBreakPositions, and start over from there. This process
|
||||
// continues in this way until we either successfully make it all the way
|
||||
// across the range, or exhaust all of our combinations of break
|
||||
// positions.)
|
||||
Stack<Integer> currentBreakPositions = new Stack<Integer>();
|
||||
Stack<Integer> possibleBreakPositions = new Stack<Integer>();
|
||||
List<Integer> wrongBreakPositions = new ArrayList<Integer>();
|
||||
|
||||
// the dictionary is implemented as a trie, which is treated as a state
|
||||
// machine. -1 represents the end of a legal word. Every word in the
|
||||
// dictionary is represented by a path from the root node to -1. A path
|
||||
// that ends in state 0 is an illegal combination of characters.
|
||||
int state = 0;
|
||||
|
||||
// these two variables are used for error handling. We keep track of the
|
||||
// farthest we've gotten through the range being divided, and the combination
|
||||
// of breaks that got us that far. If we use up all possible break
|
||||
// combinations, the text contains an error or a word that's not in the
|
||||
// dictionary. In this case, we "bless" the break positions that got us the
|
||||
// farthest as real break positions, and then start over from scratch with
|
||||
// the character where the error occurred.
|
||||
int farthestEndPoint = text.getIndex();
|
||||
Stack<Integer> bestBreakPositions = null;
|
||||
|
||||
// initialize (we always exit the loop with a break statement)
|
||||
c = CICurrent32(text);
|
||||
while (true) {
|
||||
//System.out.print("c = " + Integer.toString(c, 16) + ", pos = " + text.getIndex());
|
||||
|
||||
// if we can transition to state "-1" from our current state, we're
|
||||
// on the last character of a legal word. Push that position onto
|
||||
// the possible-break-positions stack
|
||||
if (dictionary.at(state, 0) == -1) {
|
||||
possibleBreakPositions.push(Integer.valueOf(text.getIndex()));
|
||||
}
|
||||
|
||||
// look up the new state to transition to in the dictionary
|
||||
// There will be no supplementaries here because the Thai dictionary
|
||||
// does not include any. This code is going away soon, not worth
|
||||
// fixing.
|
||||
state = (dictionary.at(state, (char)c)) & 0xFFFF; // TODO: fix supplementaries
|
||||
//System.out.print(", state = " + state);
|
||||
|
||||
// if the character we're sitting on causes us to transition to
|
||||
// the "end of word" state, then it was a non-dictionary character
|
||||
// and we've successfully traversed the whole range. Drop out
|
||||
// of the loop.
|
||||
if (state == /*-1*/ 0xFFFF) {
|
||||
currentBreakPositions.push(Integer.valueOf(text.getIndex()));
|
||||
break;
|
||||
}
|
||||
|
||||
// if the character we're sitting on causes us to transition to
|
||||
// the error state, or if we've gone off the end of the range
|
||||
// without transitioning to the "end of word" state, we've hit
|
||||
// an error...
|
||||
else if (state == 0 || text.getIndex() >= endPos) {
|
||||
|
||||
// if this is the farthest we've gotten, take note of it in
|
||||
// case there's an error in the text
|
||||
if (text.getIndex() > farthestEndPoint) {
|
||||
farthestEndPoint = text.getIndex();
|
||||
bestBreakPositions = (Stack<Integer>)(currentBreakPositions.clone());
|
||||
}
|
||||
|
||||
// wrongBreakPositions is a list of all break positions we've tried starting
|
||||
// that didn't allow us to traverse all the way through the text. Every time
|
||||
// we pop a break position off of currentBreakPositions, we put it into
|
||||
// wrongBreakPositions to avoid trying it again later. If we make it to this
|
||||
// spot, we're either going to back up to a break in possibleBreakPositions
|
||||
// and try starting over from there, or we've exhausted all possible break
|
||||
// positions and are going to do the fallback procedure. This loop prevents
|
||||
// us from messing with anything in possibleBreakPositions that didn't work as
|
||||
// a starting point the last time we tried it (this is to prevent a bunch of
|
||||
// repetitive checks from slowing down some extreme cases)
|
||||
// variable not used Integer newStartingSpot = null;
|
||||
while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
|
||||
possibleBreakPositions.peek())) {
|
||||
possibleBreakPositions.pop();
|
||||
}
|
||||
|
||||
// if we've used up all possible break-position combinations, there's
|
||||
// an error or an unknown word in the text. In this case, we start
|
||||
// over, treating the farthest character we've reached as the beginning
|
||||
// of the range, and "blessing" the break positions that got us that
|
||||
// far as real break positions
|
||||
if (possibleBreakPositions.isEmpty()) {
|
||||
if (bestBreakPositions != null) {
|
||||
currentBreakPositions = bestBreakPositions;
|
||||
if (farthestEndPoint < endPos) {
|
||||
text.setIndex(farthestEndPoint + 1);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if ((currentBreakPositions.size() == 0
|
||||
|| currentBreakPositions.peek().intValue() != text.getIndex())
|
||||
&& text.getIndex() != startPos) {
|
||||
currentBreakPositions.push(Integer.valueOf(text.getIndex()));
|
||||
}
|
||||
CINext32(text);
|
||||
currentBreakPositions.push(Integer.valueOf(text.getIndex()));
|
||||
}
|
||||
}
|
||||
|
||||
// if we still have more break positions we can try, then promote the
|
||||
// last break in possibleBreakPositions into currentBreakPositions,
|
||||
// and get rid of all entries in currentBreakPositions that come after
|
||||
// it. Then back up to that position and start over from there (i.e.,
|
||||
// treat that position as the beginning of a new word)
|
||||
else {
|
||||
Integer temp = possibleBreakPositions.pop();
|
||||
Integer temp2 = null;
|
||||
while (!currentBreakPositions.isEmpty() && temp.intValue() <
|
||||
currentBreakPositions.peek().intValue()) {
|
||||
temp2 = currentBreakPositions.pop();
|
||||
wrongBreakPositions.add(temp2);
|
||||
}
|
||||
currentBreakPositions.push(temp);
|
||||
text.setIndex(currentBreakPositions.peek().intValue());
|
||||
}
|
||||
|
||||
// re-sync "c" for the next go-round, and drop out of the loop if
|
||||
// we've made it off the end of the range
|
||||
c = CICurrent32(text);
|
||||
state = 0;
|
||||
if (text.getIndex() >= endPos) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// if we didn't hit any exceptional conditions on this last iteration,
|
||||
// just advance to the next character and loop
|
||||
else {
|
||||
c = CINext32(text);
|
||||
}
|
||||
//System.out.print(", possibleBreakPositions = { "); for (int i = 0; i < possibleBreakPositions.size(); i++) System.out.print(possibleBreakPositions.elementAt(i) + " "); System.out.print("}");
|
||||
//System.out.print(", currentBreakPositions = { "); for (int i = 0; i < currentBreakPositions.size(); i++) System.out.print(currentBreakPositions.elementAt(i) + " "); System.out.println("}");
|
||||
}
|
||||
|
||||
// dump the last break position in the list, and replace it with the actual
|
||||
// end of the range (which may be the same character, or may be further on
|
||||
// because the range actually ended with non-dictionary characters we want to
|
||||
// keep with the word)
|
||||
if (!currentBreakPositions.isEmpty()) {
|
||||
currentBreakPositions.pop();
|
||||
}
|
||||
currentBreakPositions.push(Integer.valueOf(endPos));
|
||||
|
||||
// create a regular array to hold the break positions and copy
|
||||
// the break positions from the stack to the array (in addition,
|
||||
// our starting position goes into this array as a break position).
|
||||
// This array becomes the cache of break positions used by next()
|
||||
// and previous(), so this is where we actually refresh the cache.
|
||||
cachedBreakPositions = new int[currentBreakPositions.size() + 1];
|
||||
cachedBreakPositions[0] = startPos;
|
||||
|
||||
for (int i = 0; i < currentBreakPositions.size(); i++) {
|
||||
cachedBreakPositions[i + 1] = currentBreakPositions.elementAt(i).intValue();
|
||||
}
|
||||
positionInCache = 0;
|
||||
}
|
||||
}
|
@ -0,0 +1,69 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
import java.util.Stack;
|
||||
|
||||
abstract class DictionaryBreakEngine implements LanguageBreakEngine {
|
||||
protected UnicodeSet fSet = new UnicodeSet();
|
||||
private final int fTypes;
|
||||
|
||||
/**
|
||||
* @param breakTypes A mask of the break iterators that can use this engine.
|
||||
* For example, (1 << KIND_WORD) | (1 << KIND_LINE) could be used by
|
||||
* word iterators and line iterators, but not any other kind.
|
||||
*/
|
||||
public DictionaryBreakEngine(int breakTypes) {
|
||||
// TODO: consider using a java.util.BitSet with nbits <= 32
|
||||
fTypes = breakTypes;
|
||||
}
|
||||
|
||||
public boolean handles(int c, int breakType) {
|
||||
return (breakType >= 0 && breakType < 32) && // breakType is in range
|
||||
((1 << breakType) & fTypes) != 0 && // this type can use us
|
||||
fSet.contains(c); // we recognize the character
|
||||
}
|
||||
|
||||
public int findBreaks(CharacterIterator text_, int startPos, int endPos,
|
||||
boolean reverse, int breakType, Stack<Integer> foundBreaks) {
|
||||
if (breakType < 0 || breakType >= 32 ||
|
||||
((1 << breakType) & fTypes) == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int result = 0;
|
||||
UCharacterIterator text = UCharacterIterator.getInstance(text_);
|
||||
int start = text.getIndex();
|
||||
int current, rangeStart, rangeEnd;
|
||||
int c = text.current();
|
||||
if (reverse) {
|
||||
boolean isDict = fSet.contains(c);
|
||||
while ((current = text.getIndex()) > startPos && isDict) {
|
||||
c = text.previous();
|
||||
isDict = fSet.contains(c);
|
||||
}
|
||||
rangeStart = (current < startPos) ? startPos :
|
||||
current + (isDict ? 0 : 1);
|
||||
rangeEnd = start + 1;
|
||||
} else {
|
||||
while ((current = text.getIndex()) < endPos && fSet.contains(c)) {
|
||||
c = text.next();
|
||||
}
|
||||
rangeStart = start;
|
||||
rangeEnd = current;
|
||||
}
|
||||
|
||||
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
|
||||
text.setIndex(current);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
protected abstract int divideUpDictionaryRange(UCharacterIterator text,
|
||||
int rangeStart, int rangeEnd, Stack<Integer> foundBreaks);
|
||||
}
|
@ -0,0 +1,90 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import com.ibm.icu.impl.Assert;
|
||||
import com.ibm.icu.impl.ICUBinary;
|
||||
import com.ibm.icu.impl.ICUData;
|
||||
import com.ibm.icu.impl.ICUResourceBundle;
|
||||
import com.ibm.icu.util.UResourceBundle;
|
||||
|
||||
final class DictionaryData {
|
||||
// disallow instantiation
|
||||
private DictionaryData() { }
|
||||
|
||||
public static final int TRIE_TYPE_BYTES = 0;
|
||||
public static final int TRIE_TYPE_UCHARS = 1;
|
||||
public static final int TRIE_TYPE_MASK = 7;
|
||||
public static final int TRIE_HAS_VALUES = 8;
|
||||
public static final int TRANSFORM_NONE = 0;
|
||||
public static final int TRANSFORM_TYPE_OFFSET = 0x1000000;
|
||||
public static final int TRANSFORM_TYPE_MASK = 0x7f000000;
|
||||
public static final int TRANSFORM_OFFSET_MASK = 0x1fffff;
|
||||
|
||||
public static final int IX_STRING_TRIE_OFFSET = 0;
|
||||
public static final int IX_RESERVED1_OFFSET = 1;
|
||||
public static final int IX_RESERVED2_OFFSET = 2;
|
||||
public static final int IX_TOTAL_SIZE = 3;
|
||||
public static final int IX_TRIE_TYPE = 4;
|
||||
public static final int IX_TRANSFORM = 5;
|
||||
public static final int IX_RESERVED6 = 6;
|
||||
public static final int IX_RESERVED7 = 7;
|
||||
public static final int IX_COUNT = 8;
|
||||
|
||||
private static final byte DATA_FORMAT_ID[] = { (byte) 0x44, (byte) 0x69,
|
||||
(byte) 0x63, (byte) 0x74 };
|
||||
|
||||
public static DictionaryMatcher loadDictionaryFor(String dictType) throws IOException {
|
||||
ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BRKITR_BASE_NAME);
|
||||
String dictFileName = rb.getStringWithFallback("dictionaries/" + dictType);
|
||||
dictFileName = ICUResourceBundle.ICU_BUNDLE +ICUResourceBundle.ICU_BRKITR_NAME+ "/" + dictFileName;
|
||||
InputStream is = ICUData.getStream(dictFileName);
|
||||
ICUBinary.readHeader(is, DATA_FORMAT_ID, null);
|
||||
DataInputStream s = new DataInputStream(is);
|
||||
int[] indexes = new int[IX_COUNT];
|
||||
// TODO: read indexes[IX_STRING_TRIE_OFFSET] first, then read a variable-length indexes[]
|
||||
for (int i = 0; i < IX_COUNT; i++) {
|
||||
indexes[i] = s.readInt();
|
||||
}
|
||||
int offset = indexes[IX_STRING_TRIE_OFFSET];
|
||||
Assert.assrt(offset >= (4 * IX_COUNT));
|
||||
if (offset > (4 * IX_COUNT)) {
|
||||
int diff = offset - (4 * IX_COUNT);
|
||||
s.skipBytes(diff);
|
||||
}
|
||||
int trieType = indexes[IX_TRIE_TYPE] & TRIE_TYPE_MASK;
|
||||
int totalSize = indexes[IX_TOTAL_SIZE] - offset;
|
||||
DictionaryMatcher m = null;
|
||||
if (trieType == TRIE_TYPE_BYTES) {
|
||||
int transform = indexes[IX_TRANSFORM];
|
||||
byte[] data = new byte[totalSize];
|
||||
int i;
|
||||
for (i = 0; i < data.length; i++) {
|
||||
data[i] = s.readByte();
|
||||
}
|
||||
Assert.assrt(i == totalSize);
|
||||
m = new BytesDictionaryMatcher(data, transform);
|
||||
} else if (trieType == TRIE_TYPE_UCHARS) {
|
||||
Assert.assrt(totalSize % 2 == 0);
|
||||
int num = totalSize / 2;
|
||||
char[] data = new char[totalSize / 2];
|
||||
for (int i = 0; i < num; i++) {
|
||||
data[i] = s.readChar();
|
||||
}
|
||||
m = new CharsDictionaryMatcher(new String(data));
|
||||
} else {
|
||||
m = null;
|
||||
}
|
||||
s.close();
|
||||
is.close();
|
||||
return m;
|
||||
}
|
||||
}
|
@ -0,0 +1,40 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
|
||||
/**
|
||||
* The DictionaryMatcher interface is used to allow arbitrary "types" of
|
||||
* back-end data structures to be used with the break iteration code.
|
||||
*/
|
||||
abstract class DictionaryMatcher {
|
||||
/**
|
||||
* Find dictionary words that match the text.
|
||||
*
|
||||
* @param text A CharacterIterator representing the text. The iterator is
|
||||
* left after the longest prefix match in the dictionary.
|
||||
* @param maxLength The maximum number of code units to match.
|
||||
* @param lengths An array that is filled with the lengths of words that matched.
|
||||
* @param count Filled with the number of elements output in lengths.
|
||||
* @param limit The maximum amount of words to output. Must be less than or equal to lengths.length.
|
||||
* @param values Filled with the weight values associated with the various words.
|
||||
* @return The number of characters in text that were matched.
|
||||
*/
|
||||
public abstract int matches(CharacterIterator text, int maxLength, int[] lengths,
|
||||
int[] count, int limit, int[] values);
|
||||
|
||||
public int matches(CharacterIterator text, int maxLength, int[] lengths,
|
||||
int[] count, int limit) {
|
||||
return matches(text, maxLength, lengths, count, limit, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the kind of dictionary that this matcher is using
|
||||
*/
|
||||
public abstract int getType();
|
||||
}
|
@ -0,0 +1,40 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
import java.util.Stack;
|
||||
|
||||
/**
|
||||
* The LanguageBreakEngine interface is to be used to implement any
|
||||
* language-specific logic for break iteration.
|
||||
*/
|
||||
interface LanguageBreakEngine {
|
||||
/**
|
||||
* @param c A Unicode codepoint value
|
||||
* @param breakType The kind of break iterator that is wanting to make use
|
||||
* of this engine - character, word, line, sentence
|
||||
* @return true if the engine can handle this character, false otherwise
|
||||
*/
|
||||
public boolean handles(int c, int breakType);
|
||||
|
||||
/**
|
||||
* Implements the actual breaking logic.
|
||||
* @param text The text to break over
|
||||
* @param startPos The index of the beginning of our range
|
||||
* @param endPos The index of the possible end of our range. It is possible,
|
||||
* however, that our range ends earlier
|
||||
* @param reverse true iff we are iterating backwards (in a call to
|
||||
* previous(), for example)
|
||||
* @param breakType The kind of break iterator that is wanting to make use
|
||||
* of this engine - character, word, line, sentence
|
||||
* @param foundBreaks A Stack that the breaks found will be added to
|
||||
* @return the number of words found
|
||||
*/
|
||||
public int findBreaks(CharacterIterator text, int startPos, int endPos,
|
||||
boolean reverse, int breakType, Stack<Integer> foundBreaks);
|
||||
}
|
File diff suppressed because it is too large
Load Diff
@ -1,20 +1,20 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2011, International Business Machines Corporation and *
|
||||
* Copyright (C) 2012, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.text.CharacterIterator;
|
||||
import java.util.Stack;
|
||||
|
||||
import com.ibm.icu.impl.Assert;
|
||||
|
||||
class ThaiBreakIterator extends DictionaryBasedBreakIterator {
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
|
||||
public class ThaiBreakEngine implements LanguageBreakEngine {
|
||||
/* Helper class for improving readability of the Thai word break
|
||||
* algorithm.
|
||||
*/
|
||||
@ -25,7 +25,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
|
||||
//list of word candidate lengths, in increasing length order
|
||||
private int lengths[];
|
||||
private int count[]; // Count of candidates
|
||||
private int prefix; // The longeset match with a dictionary word
|
||||
private int prefix; // The longest match with a dictionary word
|
||||
private int offset; // Offset in the text of these candidates
|
||||
private int mark; // The preferred candidate's offset
|
||||
private int current; // The candidate we're currently looking at
|
||||
@ -38,7 +38,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
|
||||
}
|
||||
|
||||
// Fill the list of candidates if needed, select the longest, and return the number found
|
||||
public int candidates(CharacterIterator fIter, BreakCTDictionary dict, int rangeEnd) {
|
||||
public int candidates(CharacterIterator fIter, DictionaryMatcher dict, int rangeEnd) {
|
||||
int start = fIter.getIndex();
|
||||
if (start != offset) {
|
||||
offset = start;
|
||||
@ -62,7 +62,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
|
||||
return lengths[mark];
|
||||
}
|
||||
|
||||
// Backup from the current candidate to the next shorter one; rreturn true if that exists
|
||||
// Backup from the current candidate to the next shorter one; return true if that exists
|
||||
// and point the text after it
|
||||
public boolean backUp(CharacterIterator fIter) {
|
||||
if (current > 0) {
|
||||
@ -82,14 +82,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
|
||||
mark = current;
|
||||
}
|
||||
}
|
||||
|
||||
private static UnicodeSet fThaiWordSet;
|
||||
private static UnicodeSet fEndWordSet;
|
||||
private static UnicodeSet fBeginWordSet;
|
||||
private static UnicodeSet fSuffixSet;
|
||||
private static UnicodeSet fMarkSet;
|
||||
private BreakCTDictionary fDictionary;
|
||||
|
||||
|
||||
// Constants for ThaiBreakIterator
|
||||
// How many words in a row are "good enough"?
|
||||
private static final byte THAI_LOOKAHEAD = 3;
|
||||
@ -104,9 +97,14 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
|
||||
private static final char THAI_MAIYAMOK = 0x0E46;
|
||||
// Minimum word size
|
||||
private static final byte THAI_MIN_WORD = 2;
|
||||
// Minimum number of characters for two words
|
||||
//private final int THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
|
||||
|
||||
|
||||
private DictionaryMatcher fDictionary;
|
||||
private static UnicodeSet fThaiWordSet;
|
||||
private static UnicodeSet fEndWordSet;
|
||||
private static UnicodeSet fBeginWordSet;
|
||||
private static UnicodeSet fSuffixSet;
|
||||
private static UnicodeSet fMarkSet;
|
||||
|
||||
static {
|
||||
// Initialize UnicodeSets
|
||||
fThaiWordSet = new UnicodeSet();
|
||||
@ -141,73 +139,28 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
|
||||
fBeginWordSet.freeze();
|
||||
fSuffixSet.freeze();
|
||||
}
|
||||
|
||||
public ThaiBreakIterator(InputStream ruleStream, InputStream dictionaryStream) throws IOException {
|
||||
super(ruleStream);
|
||||
// Initialize diciontary
|
||||
fDictionary = new BreakCTDictionary(dictionaryStream);
|
||||
|
||||
public ThaiBreakEngine() throws IOException {
|
||||
// Initialize dictionary
|
||||
fDictionary = DictionaryData.loadDictionaryFor("Thai");
|
||||
}
|
||||
|
||||
/**
|
||||
* This is the implementation function for next().
|
||||
*/
|
||||
protected int handleNext() {
|
||||
CharacterIterator text = getText();
|
||||
|
||||
// if there are no cached break positions, or if we've just moved
|
||||
// off the end of the range covered by the cache, we have to dump
|
||||
// and possibly regenerate the cache
|
||||
if (cachedBreakPositions == null || positionInCache == cachedBreakPositions.length - 1) {
|
||||
|
||||
// start by using the inherited handleNext() to find a tentative return
|
||||
// value. dictionaryCharCount tells us how many dictionary characters
|
||||
// we passed over on our way to the tentative return value
|
||||
int startPos = text.getIndex();
|
||||
fDictionaryCharCount = 0;
|
||||
int result = super.handleNext();
|
||||
|
||||
// if we passed over more than one dictionary character, then we use
|
||||
// divideUpDictionaryRange() to regenerate the cached break positions
|
||||
// for the new range
|
||||
if (fDictionaryCharCount > 1 && result - startPos > 1) {
|
||||
divideUpDictionaryRange(startPos, result);
|
||||
}
|
||||
|
||||
// otherwise, the value we got back from the inherited fuction
|
||||
// is our return value, and we can dump the cache
|
||||
else {
|
||||
cachedBreakPositions = null;
|
||||
return result;
|
||||
}
|
||||
public boolean handles(int c, int breakType) {
|
||||
if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
|
||||
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
|
||||
return (script == UScript.THAI);
|
||||
}
|
||||
// if the cache of break positions has been regenerated (or existed all
|
||||
// along), then just advance to the next break position in the cache
|
||||
// and return it
|
||||
if (cachedBreakPositions != null) {
|
||||
++positionInCache;
|
||||
text.setIndex(cachedBreakPositions[positionInCache]);
|
||||
return cachedBreakPositions[positionInCache];
|
||||
}
|
||||
Assert.assrt(false);
|
||||
return -9999; // SHOULD NEVER GET HERE!
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Divide up a range of known dictionary characters.
|
||||
*
|
||||
* @param rangeStart The start of the range of dictionary characters
|
||||
* @param rangeEnd The end of the range of dictionary characters
|
||||
* @return The number of breaks found
|
||||
*/
|
||||
private int divideUpDictionaryRange(int rangeStart, int rangeEnd) {
|
||||
public int findBreaks(CharacterIterator fIter, int rangeStart, int rangeEnd, boolean reverse, int breakType,
|
||||
Stack<Integer> foundBreaks) {
|
||||
if ((rangeEnd - rangeStart) < THAI_MIN_WORD) {
|
||||
return 0; // Not enough chacters for word
|
||||
return 0; // Not enough characters for word
|
||||
}
|
||||
CharacterIterator fIter = getText();
|
||||
int wordsFound = 0;
|
||||
int wordLength;
|
||||
int current;
|
||||
Stack<Integer> foundBreaks = new Stack<Integer>();
|
||||
PossibleWord words[] = new PossibleWord[THAI_LOOKAHEAD];
|
||||
for (int i = 0; i < THAI_LOOKAHEAD; i++) {
|
||||
words[i] = new PossibleWord();
|
||||
@ -228,7 +181,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
|
||||
wordsFound += 1;
|
||||
}
|
||||
|
||||
// If there was more than one, see which one can take use forward the most words
|
||||
// If there was more than one, see which one can take us forward the most words
|
||||
else if (candidates > 1) {
|
||||
boolean foundBest = false;
|
||||
// If we're already at the end of the range, we're done
|
||||
@ -259,9 +212,10 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
|
||||
}
|
||||
} while (words[wordsFound%THAI_LOOKAHEAD].backUp(fIter) && !foundBest);
|
||||
}
|
||||
/* foundBest: */wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter);
|
||||
wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter);
|
||||
wordsFound += 1;
|
||||
}
|
||||
|
||||
// We come here after having either found a word or not. We look ahead to the
|
||||
// next word. If it's not a dictionary word, we will combine it with the word we
|
||||
// just found (if there is one), but only if the preceding word does not exceed
|
||||
@ -291,8 +245,8 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
|
||||
// two characters after uc were not 0x0E4C THANTHAKHAT before
|
||||
// checking the dictionary. That is just a performance filter,
|
||||
// but it's not clear it's faster than checking the trie
|
||||
int candidate = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
|
||||
fIter.setIndex(current+wordLength+chars);
|
||||
int candidate = words[(wordsFound + 1) %THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
|
||||
fIter.setIndex(current + wordLength + chars);
|
||||
if (candidate > 0) {
|
||||
break;
|
||||
}
|
||||
@ -300,7 +254,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
|
||||
pc = uc;
|
||||
}
|
||||
|
||||
// Bump the word cound if there wasn't already one
|
||||
// Bump the word count if there wasn't already one
|
||||
if (wordLength <= 0) {
|
||||
wordsFound += 1;
|
||||
}
|
||||
@ -351,13 +305,13 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
fIter.setIndex(current+wordLength);
|
||||
fIter.setIndex(current + wordLength);
|
||||
}
|
||||
}
|
||||
|
||||
// Did we find a word on this iteration? If so, push it on the break stack
|
||||
if (wordLength > 0) {
|
||||
foundBreaks.push(Integer.valueOf(current+wordLength));
|
||||
foundBreaks.push(Integer.valueOf(current + wordLength));
|
||||
}
|
||||
}
|
||||
|
||||
@ -367,16 +321,7 @@ class ThaiBreakIterator extends DictionaryBasedBreakIterator {
|
||||
wordsFound -= 1;
|
||||
}
|
||||
|
||||
// Store the break points in cachedBreakPositions.
|
||||
cachedBreakPositions = new int[foundBreaks.size() + 2];
|
||||
cachedBreakPositions[0] = rangeStart;
|
||||
int i;
|
||||
for (i = 0; i < foundBreaks.size(); i++) {
|
||||
cachedBreakPositions[i + 1] = foundBreaks.elementAt(i).intValue();
|
||||
}
|
||||
cachedBreakPositions[i + 1] = rangeEnd;
|
||||
positionInCache = 0;
|
||||
|
||||
return wordsFound;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,46 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
import java.util.Stack;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
|
||||
import static com.ibm.icu.impl.CharacterIteration.*;
|
||||
|
||||
public final class UnhandledBreakEngine implements LanguageBreakEngine {
|
||||
// TODO: Use two arrays of UnicodeSet, one with all frozen sets, one with unfrozen.
|
||||
// in handleChar(), update the unfrozen version, clone, freeze, replace the frozen one.
|
||||
private final UnicodeSet[] fHandled = new UnicodeSet[BreakIterator.KIND_TITLE + 1];
|
||||
public UnhandledBreakEngine() {
|
||||
for (int i = 0; i < fHandled.length; i++) {
|
||||
fHandled[i] = new UnicodeSet();
|
||||
}
|
||||
}
|
||||
|
||||
public boolean handles(int c, int breakType) {
|
||||
return (breakType >= 0 && breakType < fHandled.length) &&
|
||||
(fHandled[breakType].contains(c));
|
||||
}
|
||||
|
||||
public int findBreaks(CharacterIterator text, int startPos, int endPos,
|
||||
boolean reverse, int breakType, Stack<Integer> foundBreaks) {
|
||||
text.setIndex(endPos);
|
||||
return 0;
|
||||
}
|
||||
|
||||
public synchronized void handleChar(int c, int breakType) {
|
||||
if (breakType >= 0 && breakType < fHandled.length && c != DONE32) {
|
||||
if (!fHandled[breakType].contains(c)) {
|
||||
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
|
||||
fHandled[breakType].applyIntPropertyValue(UProperty.SCRIPT, script);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:a243a8584459d751b33c922f2fbfaea27200721a1a27661b5fa2ec96bb5fc6e2
|
||||
size 7929565
|
||||
oid sha256:23641fd85dfa40f916a7a5b47a6dc8ebd591862a9fe2d62ddcd46b7f1a862d36
|
||||
size 9286396
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:fc6ebf5e136b448a03a7e74463c67d96217cc9f9d3feed4d2aa7f74dc5e25e63
|
||||
oid sha256:e951e7a3cc20e7126326db97e92ce533db611fde39c201795680246fde86c8e0
|
||||
size 97666
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:2029b2752b52d544749fffea9b2574ddfd19ea278cf5f26243efd98bd3f15313
|
||||
size 719725
|
||||
oid sha256:54eeee6d7834231edb7d2d9bd3174d3c4347c737f556bc6b25915bb6860b6fe2
|
||||
size 719912
|
||||
|
@ -1,16 +1,11 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2010, International Business Machines Corporation and *
|
||||
* Copyright (C) 1996-2012, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.dev.test.rbbi;
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.text.StringCharacterIterator;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@ -18,7 +13,6 @@ import java.util.Locale;
|
||||
|
||||
import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.DictionaryBasedBreakIterator;
|
||||
|
||||
public class BreakIteratorTest extends TestFmwk
|
||||
{
|
||||
@ -849,52 +843,4 @@ public class BreakIteratorTest extends TestFmwk
|
||||
errln("ERR: Failed to create an instance type: " + type + " / locale: " + loc + " / exception: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Tests the constructors public DictionaryBasedBreakIterator(String rules, ... public
|
||||
* DictionaryBasedBreakIterator(InputStream compiledRules, ...
|
||||
*/
|
||||
public void TestDictionaryBasedBreakIterator() throws IOException {
|
||||
// The following class allows the testing of the constructor
|
||||
// public DictionaryBasedBreakIterator(String rules, ...
|
||||
class TestDictionaryBasedBreakIterator extends DictionaryBasedBreakIterator {
|
||||
public TestDictionaryBasedBreakIterator(InputStream is) throws IOException {
|
||||
super("", is);
|
||||
}
|
||||
}
|
||||
try {
|
||||
@SuppressWarnings("unused")
|
||||
TestDictionaryBasedBreakIterator td = new TestDictionaryBasedBreakIterator(null);
|
||||
errln("DictionaryBasedBreakIterator constructor is suppose to return an "
|
||||
+ "exception for an empty string.");
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
try {
|
||||
File file = File.createTempFile("dummy", "");
|
||||
FileInputStream fis = new FileInputStream(file);
|
||||
DataInputStream dis = new DataInputStream(fis);
|
||||
@SuppressWarnings("unused")
|
||||
TestDictionaryBasedBreakIterator td = new TestDictionaryBasedBreakIterator(dis);
|
||||
errln("DictionaryBasedBreakIterator constructor is suppose to return an "
|
||||
+ "exception for a temporary file with EOF.");
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
// The following class allows the testing of the constructor
|
||||
// public DictionaryBasedBreakIterator(InputStream compiledRules, ...
|
||||
class TestDictionaryBasedBreakIterator1 extends DictionaryBasedBreakIterator {
|
||||
public TestDictionaryBasedBreakIterator1() throws IOException {
|
||||
super((InputStream) null, (InputStream) null);
|
||||
}
|
||||
|
||||
}
|
||||
try {
|
||||
@SuppressWarnings("unused")
|
||||
TestDictionaryBasedBreakIterator1 td1 = new TestDictionaryBasedBreakIterator1();
|
||||
errln("DictionaryBasedBreakIterator constructor is suppose to return an "
|
||||
+ "exception for an null input stream.");
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2011, International Business Machines Corporation and
|
||||
* Copyright (C) 1996-2012, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -20,7 +20,6 @@ import java.util.List;
|
||||
|
||||
import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.DictionaryBasedBreakIterator;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
@ -584,7 +583,7 @@ public class RBBITest extends TestFmwk {
|
||||
errln("Incorrect following position.");
|
||||
}
|
||||
int []fillInArray = new int[2];
|
||||
if (((DictionaryBasedBreakIterator)brk).getRuleStatusVec(fillInArray) != 1 || fillInArray[0] != 0) {
|
||||
if (((RuleBasedBreakIterator)brk).getRuleStatusVec(fillInArray) != 1 || fillInArray[0] != 0) {
|
||||
errln("Error: Since getRuleStatusVec is not supported in DictionaryBasedBreakIterator, it should return 1 and fillInArray[0] == 0.");
|
||||
}
|
||||
}
|
||||
@ -663,11 +662,6 @@ public class RBBITest extends TestFmwk {
|
||||
final String posxWordText = "Can't have breaks in xx:yy or struct.field for CS-types.";
|
||||
final int[] posxWordTOffsets = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 };
|
||||
final int[] posxWordROffsets = { 5, 6, 10, 11, 17, 18, 20, 21, 26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 };
|
||||
// KIND_WORD "ja"
|
||||
final String jaWordText = "\u79C1\u9054\u306B\u4E00\u3007\u3007\u3007\u306E\u30B3\u30F3\u30D4\u30E5\u30FC\u30BF" +
|
||||
"\u304C\u3042\u308B\u3002\u5948\u3005\u306F\u30EF\u30FC\u30C9\u3067\u3042\u308B\u3002";
|
||||
final int[] jaWordTOffsets = { 2, 3, 7, 8, 14, 17, 18, 20, 21, 24, 27, 28 };
|
||||
final int[] jaWordROffsets = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
|
||||
// KIND_SENTENCE "el"
|
||||
final String elSentText = "\u0391\u03B2, \u03B3\u03B4; \u0395 \u03B6\u03B7\u037E \u0398 \u03B9\u03BA. " +
|
||||
"\u039B\u03BC \u03BD\u03BE! \u039F\u03C0, \u03A1\u03C2? \u03A3";
|
||||
@ -688,8 +682,6 @@ public class RBBITest extends TestFmwk {
|
||||
final TBItem[] tests = {
|
||||
new TBItem( BreakIterator.KIND_WORD, new ULocale("en_US_POSIX"), posxWordText, posxWordTOffsets ),
|
||||
new TBItem( BreakIterator.KIND_WORD, ULocale.ROOT, posxWordText, posxWordROffsets ),
|
||||
new TBItem( BreakIterator.KIND_WORD, new ULocale("ja"), jaWordText, jaWordTOffsets ),
|
||||
new TBItem( BreakIterator.KIND_WORD, ULocale.ROOT, jaWordText, jaWordROffsets ),
|
||||
new TBItem( BreakIterator.KIND_SENTENCE, new ULocale("el"), elSentText, elSentTOffsets ),
|
||||
new TBItem( BreakIterator.KIND_SENTENCE, ULocale.ROOT, elSentText, elSentROffsets ),
|
||||
new TBItem( BreakIterator.KIND_CHARACTER, new ULocale("th"), thCharText, thCharTOffsets ),
|
||||
|
@ -51,7 +51,6 @@ static class TestParams {
|
||||
|
||||
|
||||
public void TestExtended() {
|
||||
|
||||
TestParams tp = new TestParams();
|
||||
|
||||
|
||||
@ -434,6 +433,7 @@ void executeTest(TestParams t) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Run the iterator backwards, verify that the same breaks are found.
|
||||
//
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2003-2011 International Business Machines Corporation and
|
||||
* Copyright (C) 2003-2012 International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -264,15 +264,19 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
UnicodeSet fExtendSet;
|
||||
UnicodeSet fExtendNumLetSet;
|
||||
UnicodeSet fOtherSet;
|
||||
|
||||
UnicodeSet fDictionaryCjkSet;
|
||||
|
||||
|
||||
RBBIWordMonkey() {
|
||||
fCharProperty = UProperty.WORD_BREAK;
|
||||
|
||||
fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]");
|
||||
fCRSet = new UnicodeSet("[\\p{Word_Break = CR}]");
|
||||
fLFSet = new UnicodeSet("[\\p{Word_Break = LF}]");
|
||||
fNewlineSet = new UnicodeSet("[\\p{Word_Break = Newline}]");
|
||||
fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]");
|
||||
fALetterSet.removeAll(fDictionaryCjkSet);
|
||||
fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]");
|
||||
fMidNumLetSet = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
|
||||
fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
|
||||
@ -297,13 +301,14 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
fOtherSet.removeAll(fExtendNumLetSet);
|
||||
// Inhibit dictionary characters from being tested at all.
|
||||
fOtherSet.removeAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));
|
||||
fOtherSet.removeAll(fDictionaryCjkSet);
|
||||
|
||||
fSets = new ArrayList();
|
||||
fSets.add(fCRSet);
|
||||
fSets.add(fLFSet);
|
||||
fSets.add(fNewlineSet);
|
||||
fSets.add(fALetterSet);
|
||||
fSets.add(fKatakanaSet);
|
||||
//fSets.add(fKatakanaSet); // TODO: work out how to test katakana
|
||||
fSets.add(fMidLetterSet);
|
||||
fSets.add(fMidNumLetSet);
|
||||
fSets.add(fMidNumSet);
|
||||
@ -1484,7 +1489,6 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
/**
|
||||
* return the index of the next code point in the input text.
|
||||
* @param i the preceding index
|
||||
* @return
|
||||
*/
|
||||
static int nextCP(StringBuffer s, int i) {
|
||||
if (i == -1) {
|
||||
|
@ -1,19 +1,15 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2006, International Business Machines Corporation and *
|
||||
* Copyright (C) 1996-2012, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.dev.test.rbbi;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ListResourceBundle;
|
||||
import java.util.MissingResourceException;
|
||||
|
||||
import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.DictionaryBasedBreakIterator;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
|
||||
// TODO: {dlf} this test currently doesn't test anything!
|
||||
@ -160,30 +156,12 @@ public class SimpleBITest extends TestFmwk{
|
||||
"Character", "Word", "Line", "Sentence"
|
||||
};
|
||||
String rulesName = kindNames[kind] + "BreakRules";
|
||||
String dictionaryName = kindNames[kind] + "BreakDictionary";
|
||||
|
||||
String[] classNames = bundle.getStringArray("BreakIteratorClasses");
|
||||
String rules = bundle.getString(rulesName);
|
||||
if (classNames[kind].equals("RuleBasedBreakIterator")) {
|
||||
iter = new RuleBasedBreakIterator(rules);
|
||||
}
|
||||
else if (classNames[kind].equals("DictionaryBasedBreakIterator")) {
|
||||
try {
|
||||
String dictionaryPath = bundle.getString(dictionaryName);
|
||||
InputStream dictionary = bundle.getClass().getResourceAsStream(dictionaryPath);
|
||||
System.out.println("looking for " + dictionaryPath + " from " + bundle.getClass() + " returned " + dictionary);
|
||||
iter = new DictionaryBasedBreakIterator(rules, dictionary);
|
||||
}
|
||||
catch(IOException e) {
|
||||
e.printStackTrace();
|
||||
errln(e.getMessage());
|
||||
System.out.println(e); // debug
|
||||
}
|
||||
catch(MissingResourceException e) {
|
||||
errln(e.getMessage());
|
||||
System.out.println(e); // debug
|
||||
}
|
||||
}
|
||||
if (iter == null) {
|
||||
errln("could not create iterator");
|
||||
}
|
||||
|
@ -33,9 +33,8 @@
|
||||
|
||||
|
||||
# Temp debugging tests
|
||||
<locale en>
|
||||
<line>
|
||||
<data>•Hello, •World.•</data>
|
||||
<data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udcfb•</data>
|
||||
|
||||
########################################################################################
|
||||
#
|
||||
@ -171,7 +170,14 @@
|
||||
<data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>
|
||||
|
||||
# Hiragana & Katakana stay together, but separates from each other and Latin.
|
||||
<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>
|
||||
# *** what to do about theoretical combos of chars? i.e. hiragana + accent
|
||||
#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<400>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<400>\N{HIRAGANA ITERATION MARK}<400>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<400>def<200>#•</data>
|
||||
|
||||
# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth
|
||||
<data>•芽キャベツ<400>芽キャベツ<400></data>
|
||||
|
||||
# Testing of word boundary for dictionary word containing both kanji and kana
|
||||
<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data>
|
||||
|
||||
# Words with interior formatting characters
|
||||
<data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data>
|
||||
@ -179,7 +185,6 @@
|
||||
# to test for bug #4097779
|
||||
<data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>
|
||||
|
||||
|
||||
# to test for bug #4098467
|
||||
# What follows is a string of Korean characters (I found it in the Yellow Pages
|
||||
# ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
|
||||
@ -188,9 +193,14 @@
|
||||
# precomposed syllables...
|
||||
<data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data>
|
||||
|
||||
<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data>
|
||||
# more Korean tests (Jamo not tested here, not counted as dictionary characters)
|
||||
# Disable them now because we don't include a Korean dictionary.
|
||||
#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<200>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data>
|
||||
#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2dd<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200> •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data>
|
||||
#<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</data>
|
||||
|
||||
<data>•\u06c9<200>\uc799<200>\ufffa•</data>
|
||||
|
||||
<data>•\u06c9\uc799\ufffa<200></data>
|
||||
|
||||
#
|
||||
# Try some words from other scripts.
|
||||
@ -507,8 +517,7 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
||||
<data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c•</data>
|
||||
|
||||
# conjoining jamo...
|
||||
# TODO: rules update needed
|
||||
#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>
|
||||
<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>
|
||||
|
||||
# to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
|
||||
<data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data>
|
||||
@ -572,17 +581,17 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
||||
# Test data originally from the test code source file
|
||||
# // @suwit -- Thai sample data from GVT Guideline
|
||||
#
|
||||
#<data>•\u0E2B\u0E19\u0E36\u0E48\u0E07<200>\u0E04\u0E33<200>\u0E44\u0E17\u0E22<200>\
|
||||
#\u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16<200>\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A<200>\
|
||||
#\u0E14\u0E49\u0E27\u0E22<200>\u0e2b\u0e25\u0e32\u0e22<200>\
|
||||
#\u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<200></data>
|
||||
#
|
||||
## Test data originally from http://bugs.icu-project.org/trac/search?q=r30327
|
||||
#<data>•กู<200> •กิน<200>กุ้ง<200> •ปิ้่<200>งอ<200>ยู่<200>ใน<200>ถ้ำ<200></data>
|
||||
#
|
||||
#<data>•\u0E01\u0E39<200>\u0020•\u0E01\u0E34\u0E19<200>\u0E01\u0E38\u0E49\u0E07<200>\
|
||||
#\u0020•\u0E1B\u0E34\u0E49\u0E48<200>\u0E07\u0E2D<200>\u0E22\u0E39\u0E48<200>\
|
||||
#\u0E43\u0E19<200>\u0E16\u0E49\u0E33<200></data>
|
||||
<data>•\u0E2B\u0E19\u0E36\u0E48\u0E07<200>\u0E04\u0E33<200>\u0E44\u0E17\u0E22<200>\
|
||||
\u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16<200>\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A<200>\
|
||||
\u0E14\u0E49\u0E27\u0E22<200>\u0e2b\u0e25\u0e32\u0e22<200>\
|
||||
\u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<200></data>
|
||||
|
||||
# Test data originally from http://bugs.icu-project.org/trac/search?q=r30327
|
||||
<data>•กู<200> •กิน<200>กุ้ง<200> •ปิ้่<200>งอ<200>ยู่<200>ใน<200>ถ้ำ<200></data>
|
||||
|
||||
<data>•\u0E01\u0E39<200>\u0020•\u0E01\u0E34\u0E19<200>\u0E01\u0E38\u0E49\u0E07<200>\
|
||||
\u0020•\u0E1B\u0E34\u0E49\u0E48<200>\u0E07\u0E2D<200>\u0E22\u0E39\u0E48<200>\
|
||||
\u0E43\u0E19<200>\u0E16\u0E49\u0E33<200></data>
|
||||
|
||||
<line>
|
||||
<data>•0E01\u0E39\u0020•\u0E01\u0E34\u0E19•\u0E01\u0E38\u0E49\u0E07\
|
||||
@ -619,22 +628,22 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
||||
# @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters start
|
||||
#
|
||||
<line>
|
||||
#<data>•\u0E1B\u0E35•\
|
||||
#\u0E1E\u0E38\u0E17\u0E18\u0E28\u0E31\u0E01\u0E23\u0E32\u0E0A •\
|
||||
#2545 •\
|
||||
#\u0E40\u0E1B\u0E47\u0E19•\
|
||||
#\u0E1B\u0E35•\
|
||||
#\u0E09\u0E25\u0E2D\u0E07•\
|
||||
#\u0E04\u0E23\u0E1A•\
|
||||
#\u0E23\u0E2D\u0E1A •\
|
||||
#\"\u0E52\u0E52\u0E50 •\
|
||||
#\u0E1b\u0E35\" •\
|
||||
#\u0E02\u0E2d\u0E07•\
|
||||
#\u0E01\u0E23\u0E38\u0E07•\
|
||||
#\u0E23\u0E31\u0E15\u0E19\u0E42\u0E01\u0E2A\u0E34\u0E19\u0E17\u0E23\u0E4C •\
|
||||
#(\u0E01\u0E23\u0E38\u0E07\u0E40\u0E17\u0E1e\u0E2F•\
|
||||
#\u0E2B\u0E23\u0E37\u0E2D •\
|
||||
#Bangkok)•</data>
|
||||
<data>•\u0E1B\u0E35•\
|
||||
\u0E1E\u0E38\u0E17\u0E18•\u0E28\u0E31\u0E01\u0E23\u0E32\u0E0A •\
|
||||
2545 •\
|
||||
\u0E40\u0E1B\u0E47\u0E19•\
|
||||
\u0E1B\u0E35•\
|
||||
\u0E09\u0E25\u0E2D\u0E07•\
|
||||
\u0E04\u0E23\u0E1A•\
|
||||
\u0E23\u0E2D\u0E1A •\
|
||||
\"\u0E52\u0E52\u0E50 •\
|
||||
\u0E1b\u0E35\" •\
|
||||
\u0E02\u0E2d\u0E07•\
|
||||
\u0E01\u0E23\u0E38\u0E07•\
|
||||
\u0E23\u0E31\u0E15\u0E19•\u0E42\u0E01•\u0E2A\u0E34•\u0E19\u0E17\u0E23\u0E4C •\
|
||||
(\u0E01\u0E23\u0E38\u0E07\u0E40\u0E17\u0E1e\u0E2F\
|
||||
\u0E2B\u0E23\u0E37\u0E2D •\
|
||||
Bangkok)•</data>
|
||||
|
||||
# Data originally from RBBITest::TestMaiyamok()
|
||||
# The Thai maiyamok character is a shorthand symbol that means "repeat the previous
|
||||
@ -652,58 +661,6 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
||||
\u0e22\u0e07•\
|
||||
\u0e43\u0e2b\u0e21\u0e48•</data>
|
||||
|
||||
|
||||
|
||||
##########################################################################################
|
||||
#
|
||||
# Khmer Tests
|
||||
#
|
||||
##########################################################################################
|
||||
|
||||
# Test data originally from http://bugs.icu-project.org/trac/search?q=r30327
|
||||
# from the file testdata/wordsegments.txt
|
||||
<locale th>
|
||||
<word>
|
||||
|
||||
#<data>•តើ<200>លោក<200>មក<200>ពី<200>ប្រទេស<200>ណា<200></data>
|
||||
#<data>•សណ្ដូក<200>ក<200>បណ្ដែត<200>ខ្លួន<200></data>
|
||||
#<data>•ពណ៌ស<200>ម្ដេច<200>ថា<200>ខ្មៅ<200></data>
|
||||
##ប្រយោគ|ពី|របៀប|រួបរួម|និង|ភាព|ផ្សេងគ្នា|ដែល|អាច|ចូល<200></data>
|
||||
#<data>•ប្រយោគ<200>ពី<200>របៀប<200>ដែល<200>និង<200>ភាព<200>ផ្សេងគ្នា<200>ដែល<200>អាច<200>ចូល<200></data>
|
||||
##ប្រយោគ|ពី|របៀប|ជា|មួយ|និង|ភាព|ផ្សេងគ្នា|ដែល|អាច|ចូល<200></data>
|
||||
#<data>•សូម<200>ចំណាយពេល<200>បន្តិច<200>ដើម្បី<200>អធិស្ឋាន<200>អរព្រះគុណ<200>ដល់<200>ព្រះអង្គ<200></data>
|
||||
#<data>•ការ<200>ថោកទាប<200>បរិប្បូណ៌<200>ដោយ<200></data>
|
||||
#<data>•ប្រើប្រាស់<200>ស្អាត<200>ទាំង<200>ចិត្ត<200>សិស្ស<200>នោះ<200></data>
|
||||
#<data>•បើ<200>អ្នក<200>ប្រព្រឺត្ត<200>អំពើអាក្រក់<200>មុខ<200>ជា<200>មាន<200></data>
|
||||
#<data>•ប្រដាប់<200>ប្រដា<200>រ<200>រៀនសូត្រ<200>បន្ទប់<200>រៀន<200></data>
|
||||
#<data>•ដើរតួ<200>មនុស្សគ<200>ឥត<200>បញ្ចេញ<200>យោបល់<200>សោះ<200>ឡើយ<200></data>
|
||||
#<data>•មិន<200>អាច<200>ឲ្យ<200>យើង<200>ធ្វើ<200>កសិកម្ម<200>បាន<200>ឡើយ<200></data>
|
||||
#<data>•បន្ត<200>សេចក្ត<200>ទៅទៀត<200></data>
|
||||
#<data>•ក្រុម<200>ប៉ូលិស<200>បណ្តាក់<200>គ្នា<200></data>
|
||||
#<data>•គ្មាន<200>សុខ<200>សំរាន្ត<200>ដង<200>ណា<200></data>
|
||||
#<data>•បាន<200>សុខភាព<200>បរិប្បូណ៌<200></data>
|
||||
#<data>•ជា<200>មេចោរ<200>ខ្ញុំ<200>នឹង<200>ស្លាប់<200>ទៅវិញ<200>ជា<200>មេចោរ<200></data>
|
||||
#<data>•ឯ<200>ការ<200>វាយ<200>ផ្ចាល<200>ដែល<200>នាំ<200></data>
|
||||
#<data>•គេ<200>ដឹក<200>ទៅ<200>សំឡាប់<200></data>
|
||||
##អ្នក|ដែល|ជា|មន្ត្រី|ធំ|លើ|គាត់|ទេ<200></data>
|
||||
#<data>•យក<200>ទៅ<200>សម្លាប់ចោល<200>ស្ងាត់<200></data>
|
||||
#<data>•ត្រូវ<200>បាន<200>គេ<200>សម្លាប់<200></data>
|
||||
#<data>•នៅក្នុង<200>ស្រុក<200>ខ្ល<200>ងហ្ស៊ុន<200></data>
|
||||
|
||||
|
||||
#
|
||||
# Jitterbug 3671 Test Case
|
||||
#
|
||||
#<data>•สวัสดี<200>ครับ<200>สบาย<200>ดี<200>ไหม<200> •ครับ<200></data>
|
||||
|
||||
#
|
||||
# Trac ticket 5595 Test Case
|
||||
#<data>•บท<200>ที่๑พายุ<200>ไซโคลน<200>โด<200>โรธี<200>อาศัย<200>อยู่<200>ท่ามกลาง<200>\
|
||||
#ทุ่งใหญ่<200>ใน<200>แคนซัส<200>กับ<200>ลุง<200>เฮ<200>นรี<200>ชาวไร่<200>และ<200>ป้า<200>เอ็ม<200>\
|
||||
#ภรรยา<200>ชาวไร่<200>บ้าน<200>ของ<200>พวก<200>เขา<200>หลัง<200>เล็ก<200>เพราะ<200>ไม้<200>\
|
||||
#สร้าง<200>บ้าน<200>ต้อง<200>ขน<200>มา<200>ด้วย<200>เกวียน<200>เป็น<200>ระยะ<200>ทาง<200>หลาย<200>\
|
||||
#ไมล์<200></data>
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# Tailored (locale specific) breaking.
|
||||
@ -714,7 +671,7 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
||||
|
||||
<locale ja>
|
||||
<line>
|
||||
<data>•\u3041•\u3043•\u3045•\u31f1•</data>
|
||||
<data>•\u3041\u3043\u3045\u31f1•</data>
|
||||
<locale en>
|
||||
<line>
|
||||
<data>•\u3041\u3043\u3045\u31f1•</data>
|
||||
@ -722,19 +679,20 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
||||
# The following data was originally in RBBITest::TestJapaneseWordBreak()
|
||||
<locale ja>
|
||||
<word>
|
||||
<data>•\u4ECA\u65E5<400>\u306F\u3044\u3044<300>\u5929\u6C17<400>\u3067\u3059\u306D<300>\u3002•\u000D\u000A•</data>
|
||||
<data>•\u4ECA\u65E5<400>\u306F<400>\u3044\u3044<400>\u5929\u6C17<400>\u3067\u3059<400>\u306D<400>\u3002•\u000D\u000A•</data>
|
||||
|
||||
# UBreakIteratorType UBRK_WORD, Locale "ja"
|
||||
# Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
|
||||
# \u79C1\u9054\u306B\u4E00\u3007\u3007\u3007\u306E\u30B3\u30F3\u30D4\u30E5\u30FC\u30BF\u304C\u3042\u308B\u3002\u5948\u3005\u306F\u30EF\u30FC\u30C9\u3067\u3042\u308B\u3002
|
||||
# modified to work with dbbi code - should verify
|
||||
|
||||
<locale ja>
|
||||
<word>
|
||||
<data>•私達<400>に<300>一〇〇〇<400>の<300>コンピュータ<300>がある<300>。<0>奈々<400>は<300>ワード<300>である<300>。•</data>
|
||||
<data>•私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈<400>々<400>は<400>ワ<400>ー<400>ドで<400>あ<400>る<400>。•</data>
|
||||
|
||||
<locale root>
|
||||
<word>
|
||||
<data>•私<400>達<400>に<300>一<400>〇<400>〇<400>〇<400>の<300>コンピュータ<300>が<300>あ<300>る<300>。<0>奈<400>々<200>は<300>ワード<300>で<300>あ<300>る<300>。•</data>
|
||||
<data>•私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈<400>々<400>は<400>ワ<400>ー<400>ドで<400>あ<400>る<400>。•</data>
|
||||
|
||||
# UBreakIteratorType UBRK_SENTENCE, Locale "el"
|
||||
# Add break after Greek question mark (cldrbug #2069).
|
||||
|
@ -474,18 +474,6 @@ public final class ICUResourceBundleTest extends TestFmwk {
|
||||
errln("Did not get the expected output for referencingalias");
|
||||
}
|
||||
}
|
||||
{
|
||||
rb = (UResourceBundle)UResourceBundle.getBundleInstance("com/ibm/icu/dev/data/testdata","testaliases",testLoader);
|
||||
sub = rb.get("boundaries");
|
||||
String word = sub.getString("word");
|
||||
|
||||
if(word.equals("word_ja.brk")){
|
||||
logln("Got the expected output for boundaries/word");
|
||||
}else{
|
||||
errln("Did not get the expected type for boundaries/word");
|
||||
}
|
||||
|
||||
}
|
||||
{
|
||||
UResourceBundle rb1 = (UResourceBundle)UResourceBundle.getBundleInstance("com/ibm/icu/dev/data/testdata","testaliases",testLoader);
|
||||
if(rb1!=rb){
|
||||
|
@ -104,23 +104,6 @@ public class ULocaleTest extends TestFmwk {
|
||||
}
|
||||
*/
|
||||
|
||||
public void TestBreakIterator() {
|
||||
checkService("ja_JP_OSAKA", new ServiceFacade() {
|
||||
public Object create(ULocale req) {
|
||||
return BreakIterator.getWordInstance(req);
|
||||
}
|
||||
}, null, new Registrar() {
|
||||
public Object register(ULocale loc, Object prototype) {
|
||||
return BreakIterator.registerInstance(
|
||||
(BreakIterator) prototype,
|
||||
loc, BreakIterator.KIND_WORD);
|
||||
}
|
||||
public boolean unregister(Object key) {
|
||||
return BreakIterator.unregister(key);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public void TestDateFormat() {
|
||||
checkService("de_CH_ZURICH", new ServiceFacade() {
|
||||
public Object create(ULocale req) {
|
||||
|
Loading…
Reference in New Issue
Block a user