2006-03-23 00:54:12 +00:00
|
|
|
/**
|
2007-05-02 23:23:31 +00:00
|
|
|
************************************************************************************
|
2011-05-04 13:25:37 +00:00
|
|
|
* Copyright (C) 2006-2009,2011, International Business Machines Corporation *
|
|
|
|
* and others. All Rights Reserved. *
|
2007-05-02 23:23:31 +00:00
|
|
|
************************************************************************************
|
2006-03-23 00:54:12 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
|
|
|
|
#if !UCONFIG_NO_BREAK_ITERATION
|
|
|
|
|
|
|
|
#include "brkeng.h"
|
|
|
|
#include "dictbe.h"
|
|
|
|
#include "triedict.h"
|
|
|
|
#include "unicode/uchar.h"
|
|
|
|
#include "unicode/uniset.h"
|
|
|
|
#include "unicode/chariter.h"
|
|
|
|
#include "unicode/ures.h"
|
|
|
|
#include "unicode/udata.h"
|
|
|
|
#include "unicode/putil.h"
|
2006-04-21 00:30:22 +00:00
|
|
|
#include "unicode/ustring.h"
|
2007-05-02 23:07:12 +00:00
|
|
|
#include "unicode/uscript.h"
|
2006-03-23 00:54:12 +00:00
|
|
|
#include "uvector.h"
|
2006-09-08 03:35:35 +00:00
|
|
|
#include "umutex.h"
|
2006-03-23 00:54:12 +00:00
|
|
|
#include "uresimp.h"
|
|
|
|
#include "ubrkimpl.h"
|
|
|
|
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
|
|
|
/*
|
|
|
|
******************************************************************
|
|
|
|
*/
|
|
|
|
|
|
|
|
// Default constructor: the base engine has nothing to set up here.
LanguageBreakEngine::LanguageBreakEngine()
{
}
|
|
|
|
|
|
|
|
// Destructor: this body releases nothing.
LanguageBreakEngine::~LanguageBreakEngine()
{
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
******************************************************************
|
|
|
|
*/
|
|
|
|
|
|
|
|
// Default constructor: the base factory has nothing to set up here.
LanguageBreakFactory::LanguageBreakFactory()
{
}
|
|
|
|
|
|
|
|
// Destructor: this body releases nothing.
LanguageBreakFactory::~LanguageBreakFactory()
{
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
******************************************************************
|
|
|
|
*/
|
|
|
|
|
2006-03-23 20:48:47 +00:00
|
|
|
/**
 * Constructs an engine with every per-break-type slot of fHandled empty.
 * The sets themselves are created lazily by handleCharacter().
 * The error code parameter is accepted but not used.
 */
UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
    const int32_t slotCount = (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]));
    int32_t slot = 0;
    while (slot < slotCount) {
        fHandled[slot] = 0;
        ++slot;
    }
}
|
|
|
|
|
|
|
|
/**
 * Destroys every set that handleCharacter() allocated; slots that were
 * never populated are skipped.
 */
UnhandledEngine::~UnhandledEngine() {
    const int32_t slotCount = (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]));
    for (int32_t slot = 0; slot < slotCount; ++slot) {
        if (fHandled[slot] == 0) {
            continue;
        }
        delete fHandled[slot];
    }
}
|
|
|
|
|
|
|
|
/**
 * Reports whether this engine has recorded the character for the break type.
 *
 * @param c         The code point to test.
 * @param breakType Index into fHandled; out-of-range values yield false.
 * @return true only when a set exists for breakType and it contains c.
 */
UBool
UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
    const int32_t slotCount = (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]));
    if (breakType < 0 || breakType >= slotCount) {
        return FALSE;
    }
    if (fHandled[breakType] == 0) {
        return FALSE;
    }
    return fHandled[breakType]->contains(c);
}
|
|
|
|
|
|
|
|
/**
 * Advances (or, when reverse is set, backs up) the text position across the
 * run of characters recorded in the set for breakType. This engine never
 * reports any breaks.
 *
 * @param text      The text to scan, positioned at the start of the run.
 * @param startPos  Lower bound for the scan; only consulted when reverse.
 * @param endPos    Upper bound for the scan; only consulted when not reverse.
 * @param reverse   Scan direction: true moves backward through the text.
 * @param breakType Index into fHandled; out-of-range values are ignored.
 * @return Always 0, the number of breaks found.
 */
int32_t
UnhandledEngine::findBreaks( UText *text,
                             int32_t startPos,
                             int32_t endPos,
                             UBool reverse,
                             int32_t breakType,
                             UStack &/*foundBreaks*/ ) const {
    const int32_t slotCount = (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]));
    if (breakType < 0 || breakType >= slotCount) {
        return 0;
    }
    // The set for this break type is created lazily by handleCharacter() and
    // stays null if that allocation failed or never happened. handles()
    // guards against that case, so do the same here instead of dereferencing
    // a null pointer.
    if (fHandled[breakType] == 0) {
        return 0;
    }

    UChar32 c = utext_current32(text);
    if (reverse) {
        while ((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
            c = utext_previous32(text);
        }
    }
    else {
        while ((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
            utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
            c = utext_current32(text);
        }
    }
    return 0;
}
|
|
|
|
|
|
|
|
void
|
|
|
|
UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
|
2006-03-23 02:54:34 +00:00
|
|
|
if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
|
2006-03-23 00:54:12 +00:00
|
|
|
if (fHandled[breakType] == 0) {
|
|
|
|
fHandled[breakType] = new UnicodeSet();
|
|
|
|
if (fHandled[breakType] == 0) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!fHandled[breakType]->contains(c)) {
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
// Apply the entire script of the character.
|
|
|
|
int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
|
|
|
|
fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
******************************************************************
|
|
|
|
*/
|
|
|
|
|
2006-03-23 20:48:47 +00:00
|
|
|
/**
 * Constructs a factory with no engine stack; getEngineFor() creates the
 * stack on demand. The error code parameter is accepted but not used.
 */
ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
    fEngines = NULL;
}
|
|
|
|
|
|
|
|
/**
 * Destroys the engine stack, if one was ever created.
 */
ICULanguageBreakFactory::~ICULanguageBreakFactory() {
    if (fEngines == NULL) {
        return;
    }
    delete fEngines;
}
|
|
|
|
|
|
|
|
U_NAMESPACE_END
|
|
|
|
U_CDECL_BEGIN
|
|
|
|
/**
 * Element deleter handed to the UStack created in getEngineFor(): treats the
 * untyped element as a LanguageBreakEngine and deletes it.
 */
static void U_CALLCONV _deleteEngine(void *obj) {
    const U_NAMESPACE_QUALIFIER LanguageBreakEngine *engine =
        (const U_NAMESPACE_QUALIFIER LanguageBreakEngine *) obj;
    delete engine;
}
|
|
|
|
U_CDECL_END
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
|
|
|
/**
 * Scans an engine stack from the most recently pushed entry downward and
 * returns the first engine that handles the character, or NULL if none does.
 * Callers in this file invoke it while holding the global mutex.
 */
static const LanguageBreakEngine *
findHandlingEngine(UStack *stack, UChar32 c, int32_t breakType) {
    for (int32_t idx = stack->size() - 1; idx >= 0; --idx) {
        const LanguageBreakEngine *candidate =
            (const LanguageBreakEngine *)(stack->elementAt(idx));
        if (candidate != NULL && candidate->handles(c, breakType)) {
            return candidate;
        }
    }
    return NULL;
}

/**
 * Returns an engine able to handle the character for the given break type,
 * creating the engine stack and/or loading a new engine when necessary.
 *
 * @param c         The code point an engine is needed for.
 * @param breakType The kind of break iteration being performed.
 * @return A matching engine, or NULL when the stack could not be created or
 *         no engine could be found or loaded.
 */
const LanguageBreakEngine *
ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
    UErrorCode status = U_ZERO_ERROR;
    const LanguageBreakEngine *found = NULL;

    // TODO: The global mutex should not be used.
    // The global mutex should only be used for short periods.
    // An ICULanguageBreakFactory-specific mutex should be used.
    umtx_lock(NULL);
    const UBool stackMissing = (UBool)(fEngines == NULL);
    if (!stackMissing) {
        found = findHandlingEngine(fEngines, c, breakType);
    }
    umtx_unlock(NULL);

    if (found != NULL) {
        return found;
    }

    if (stackMissing) {
        // Build the stack outside the lock, then install it only if no other
        // caller installed one in the meantime.
        UStack *fresh = new UStack(_deleteEngine, NULL, status);
        if (U_SUCCESS(status) && fresh == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        else if (U_FAILURE(status)) {
            delete fresh;
            fresh = NULL;
        }
        else {
            umtx_lock(NULL);
            if (fEngines == NULL) {
                fEngines = fresh;
                fresh = NULL;
            }
            umtx_unlock(NULL);
            // Non-null only when another caller's stack was installed first.
            delete fresh;
        }
    }

    if (fEngines == NULL) {
        return NULL;
    }

    // Nothing matched on the first pass, or there was no stack at all:
    // try to create an engine, again without holding the lock.
    const LanguageBreakEngine *created = loadEngineFor(c, breakType);

    // Re-check under the lock in case a matching engine was pushed while the
    // new one was being loaded.
    umtx_lock(NULL);
    found = findHandlingEngine(fEngines, c, breakType);
    if (found == NULL && created != NULL) {
        fEngines->push((void *)created, status);
        found = created;
        created = NULL;
    }
    umtx_unlock(NULL);

    // Still non-null only if an existing engine was preferred (or none was
    // loaded, in which case this is a no-op).
    delete created;

    return found;
}
|
|
|
|
|
2007-05-02 23:07:12 +00:00
|
|
|
/**
 * Creates a dictionary-based break engine for the script of the character.
 * Only Thai and Khmer are recognised here; every other script yields NULL.
 *
 * @param c         The code point whose script selects the engine.
 * @param breakType Passed through to loadDictionaryFor().
 * @return A newly allocated engine, or NULL if the script lookup failed, no
 *         dictionary could be loaded, the script has no engine, or engine
 *         construction reported an error.
 */
const LanguageBreakEngine *
ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
    UErrorCode status = U_ZERO_ERROR;
    UScriptCode code = uscript_getScript(c, &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    const CompactTrieDictionary *dict = loadDictionaryFor(code, breakType);
    if (dict == NULL) {
        return NULL;
    }

    const LanguageBreakEngine *engine = NULL;
    if (code == USCRIPT_THAI) {
        engine = new ThaiBreakEngine(dict, status);
    }
    else if (code == USCRIPT_KHMER) {
        engine = new KhmerBreakEngine(dict, status);
    }

    if (engine == NULL) {
        // No engine was created, so the dictionary must be released here.
        delete dict;
        return NULL;
    }
    if (U_FAILURE(status)) {
        // NOTE(review): dict is not deleted on this path, presumably because
        // the engine took ownership of it -- confirm against dictbe.h.
        delete engine;
        return NULL;
    }
    return engine;
}
|
|
|
|
|
|
|
|
/**
 * Loads the break dictionary registered for a script.
 *
 * The dictionary file name is read from the "dictionaries" table of the
 * brkitr root bundle, keyed by the script's short name. The name is split at
 * its first '.' into an item name and a type (extension), and the data item
 * is then opened from the brkitr tree.
 *
 * @param script The script whose dictionary is wanted.
 * @return A newly allocated dictionary, or NULL if the resource lookup, the
 *         name conversion, the data load or the allocation failed.
 */
const CompactTrieDictionary *
ICULanguageBreakFactory::loadDictionaryFor(UScriptCode script, int32_t /*breakType*/) {
    UErrorCode status = U_ZERO_ERROR;
    // Both buffers start out as empty strings so they are never read
    // uninitialized.
    char dictnbuff[256] = {'\0'};
    char ext[4] = {'\0'};

    // Open root from brkitr tree.
    UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
    b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
    b = ures_getByKeyWithFallback(b, uscript_getShortName(script), b, &status);
    int32_t dictnlength = 0;
    const UChar *dictfname = ures_getString(b, &dictnlength, &status);
    if (U_SUCCESS(status) && (size_t)dictnlength >= sizeof(dictnbuff)) {
        dictnlength = 0;
        status = U_BUFFER_OVERFLOW_ERROR;
    }
    if (U_SUCCESS(status) && dictfname != NULL) {
        const UChar *extStart = u_strchr(dictfname, 0x002e);
        // Without a '.', the whole string is the item name and the type
        // stays empty.
        int32_t nameLen = dictnlength;
        int32_t extLen = 0;
        if (extStart != NULL) {
            nameLen = (int32_t)(extStart - dictfname);
            extLen = dictnlength - nameLen - 1;
        }
        if (extLen < 0 || (size_t)extLen >= sizeof(ext)) {
            // The extension would not fit with its terminator; fail rather
            // than leave ext unterminated.
            status = U_BUFFER_OVERFLOW_ERROR;
        }
        else {
            // Copy exactly as many characters as each part has, so nothing
            // past the end of the resource string is read.
            if (extLen > 0) {
                u_UCharsToChars(extStart + 1, ext, extLen);
            }
            ext[extLen] = 0;            // nul terminate
            if (nameLen > 0) {
                u_UCharsToChars(dictfname, dictnbuff, nameLen);
            }
            dictnbuff[nameLen] = 0;     // nul terminate
        }
    }
    ures_close(b);
    if (U_FAILURE(status)) {
        return NULL;
    }

    UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext, dictnbuff, &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    const CompactTrieDictionary *dict = new CompactTrieDictionary(file, status);
    if (dict == NULL) {
        // No dictionary object exists to take charge of the data handle, so
        // close it here to avoid leaking it.
        udata_close(file);
        return NULL;
    }
    if (U_FAILURE(status)) {
        // NOTE(review): file is not closed on this path, presumably because
        // the dictionary destructor releases it -- confirm in triedict.h.
        delete dict;
        return NULL;
    }
    return dict;
}
|
|
|
|
|
2006-03-23 00:54:12 +00:00
|
|
|
U_NAMESPACE_END
|
|
|
|
|
|
|
|
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|