Files
tubestation/intl/icu/source/common/brkeng.cpp
Alexandru Marc 52a93f69fa Backed out 19 changesets (bug 1927706) for causing reftest failures @ space-cluster-2.html
Backed out changeset 3877f1fa62f5 (bug 1927706)
Backed out changeset 233c6ebf84a2 (bug 1927706)
Backed out changeset 07e5871d5fa3 (bug 1927706)
Backed out changeset 84ef75087931 (bug 1927706)
Backed out changeset f89b916619e1 (bug 1927706)
Backed out changeset b82d9d622315 (bug 1927706)
Backed out changeset b0d2c5711865 (bug 1927706)
Backed out changeset 9529dda25bd9 (bug 1927706)
Backed out changeset 40b7907d7fc8 (bug 1927706)
Backed out changeset c549655dbd73 (bug 1927706)
Backed out changeset c5cc289771b3 (bug 1927706)
Backed out changeset 8ef66f7822c4 (bug 1927706)
Backed out changeset dff6d37fb2fe (bug 1927706)
Backed out changeset 083a0b3da643 (bug 1927706)
Backed out changeset 06649ac72a19 (bug 1927706)
Backed out changeset 019f7533abbc (bug 1927706)
Backed out changeset f1539604c459 (bug 1927706)
Backed out changeset 578667f1f0d4 (bug 1927706)
Backed out changeset 8ed1e7e7d4ab (bug 1927706)
2024-10-30 11:19:58 +02:00

305 lines
9.4 KiB
C++

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
************************************************************************************
* Copyright (C) 2006-2016, International Business Machines Corporation
* and others. All Rights Reserved.
************************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/chariter.h"
#include "unicode/ures.h"
#include "unicode/udata.h"
#include "unicode/putil.h"
#include "unicode/ustring.h"
#include "unicode/uscript.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "brkeng.h"
#include "cmemory.h"
#include "dictbe.h"
#include "lstmbe.h"
#include "charstr.h"
#include "dictionarydata.h"
#include "mutex.h"
#include "uvector.h"
#include "umutex.h"
#include "uresimp.h"
#include "ubrkimpl.h"
U_NAMESPACE_BEGIN
/*
******************************************************************
*/
LanguageBreakEngine::LanguageBreakEngine() {
}
LanguageBreakEngine::~LanguageBreakEngine() {
}
/*
******************************************************************
*/
LanguageBreakFactory::LanguageBreakFactory() {
}
LanguageBreakFactory::~LanguageBreakFactory() {
}
/*
******************************************************************
*/
UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) {
(void)status;
}
UnhandledEngine::~UnhandledEngine() {
delete fHandled;
fHandled = nullptr;
}
UBool
UnhandledEngine::handles(UChar32 c) const {
return fHandled && fHandled->contains(c);
}
int32_t
UnhandledEngine::findBreaks( UText *text,
int32_t /* startPos */,
int32_t endPos,
UVector32 &/*foundBreaks*/,
UBool /* isPhraseBreaking */,
UErrorCode &status) const {
if (U_FAILURE(status)) return 0;
UChar32 c = utext_current32(text);
while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
utext_next32(text); // TODO: recast loop to work with post-increment operations.
c = utext_current32(text);
}
return 0;
}
void
UnhandledEngine::handleCharacter(UChar32 c) {
if (fHandled == nullptr) {
fHandled = new UnicodeSet();
if (fHandled == nullptr) {
return;
}
}
if (!fHandled->contains(c)) {
UErrorCode status = U_ZERO_ERROR;
// Apply the entire script of the character.
int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
}
}
/*
******************************************************************
*/
ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
fEngines = 0;
}
ICULanguageBreakFactory::~ICULanguageBreakFactory() {
if (fEngines != 0) {
delete fEngines;
}
}
U_NAMESPACE_END
U_CDECL_BEGIN
static void U_CALLCONV _deleteEngine(void *obj) {
delete (const icu::LanguageBreakEngine *) obj;
}
U_CDECL_END
U_NAMESPACE_BEGIN
const LanguageBreakEngine *
ICULanguageBreakFactory::getEngineFor(UChar32 c) {
const LanguageBreakEngine *lbe = nullptr;
UErrorCode status = U_ZERO_ERROR;
static UMutex gBreakEngineMutex;
Mutex m(&gBreakEngineMutex);
if (fEngines == nullptr) {
LocalPointer<UStack> engines(new UStack(_deleteEngine, nullptr, status), status);
if (U_FAILURE(status) ) {
// Note: no way to return error code to caller.
return nullptr;
}
fEngines = engines.orphan();
} else {
int32_t i = fEngines->size();
while (--i >= 0) {
lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
if (lbe != nullptr && lbe->handles(c)) {
return lbe;
}
}
}
// We didn't find an engine. Create one.
lbe = loadEngineFor(c);
if (lbe != nullptr) {
fEngines->push((void *)lbe, status);
}
return U_SUCCESS(status) ? lbe : nullptr;
}
const LanguageBreakEngine *
ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
UErrorCode status = U_ZERO_ERROR;
UScriptCode code = uscript_getScript(c, &status);
if (U_SUCCESS(status)) {
const LanguageBreakEngine *engine = nullptr;
// Try to use LSTM first
const LSTMData *data = CreateLSTMDataForScript(code, status);
if (U_SUCCESS(status)) {
if (data != nullptr) {
engine = CreateLSTMBreakEngine(code, data, status);
if (U_SUCCESS(status) && engine != nullptr) {
return engine;
}
if (engine != nullptr) {
delete engine;
engine = nullptr;
} else {
DeleteLSTMData(data);
}
}
}
status = U_ZERO_ERROR; // fallback to dictionary based
DictionaryMatcher *m = loadDictionaryMatcherFor(code);
if (m != nullptr) {
switch(code) {
case USCRIPT_THAI:
engine = new ThaiBreakEngine(m, status);
break;
case USCRIPT_LAO:
engine = new LaoBreakEngine(m, status);
break;
case USCRIPT_MYANMAR:
engine = new BurmeseBreakEngine(m, status);
break;
case USCRIPT_KHMER:
engine = new KhmerBreakEngine(m, status);
break;
#if !UCONFIG_NO_NORMALIZATION
// CJK not available w/o normalization
case USCRIPT_HANGUL:
engine = new CjkBreakEngine(m, kKorean, status);
break;
// use same BreakEngine and dictionary for both Chinese and Japanese
case USCRIPT_HIRAGANA:
case USCRIPT_KATAKANA:
case USCRIPT_HAN:
engine = new CjkBreakEngine(m, kChineseJapanese, status);
break;
#if 0
// TODO: Have to get some characters with script=common handled
// by CjkBreakEngine (e.g. U+309B). Simply subjecting
// them to CjkBreakEngine does not work. The engine has to
// special-case them.
case USCRIPT_COMMON:
{
UBlockCode block = ublock_getCode(code);
if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
engine = new CjkBreakEngine(dict, kChineseJapanese, status);
break;
}
#endif
#endif
default:
break;
}
if (engine == nullptr) {
delete m;
}
else if (U_FAILURE(status)) {
delete engine;
engine = nullptr;
}
return engine;
}
}
return nullptr;
}
DictionaryMatcher *
ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
UErrorCode status = U_ZERO_ERROR;
// open root from brkitr tree.
UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
int32_t dictnlength = 0;
const char16_t *dictfname =
ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
if (U_FAILURE(status)) {
ures_close(b);
return nullptr;
}
CharString dictnbuf;
CharString ext;
const char16_t *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot
if (extStart != nullptr) {
int32_t len = (int32_t)(extStart - dictfname);
ext.appendInvariantChars(UnicodeString(false, extStart + 1, dictnlength - len - 1), status);
dictnlength = len;
}
dictnbuf.appendInvariantChars(UnicodeString(false, dictfname, dictnlength), status);
ures_close(b);
UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
if (U_SUCCESS(status)) {
// build trie
const uint8_t *data = (const uint8_t *)udata_getMemory(file);
const int32_t *indexes = (const int32_t *)data;
const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
DictionaryMatcher *m = nullptr;
if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
const char *characters = (const char *)(data + offset);
m = new BytesDictionaryMatcher(characters, transform, file);
}
else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
const char16_t *characters = (const char16_t *)(data + offset);
m = new UCharsDictionaryMatcher(characters, file);
}
if (m == nullptr) {
// no matcher exists to take ownership - either we are an invalid
// type or memory allocation failed
udata_close(file);
}
return m;
} else if (dictfname != nullptr) {
// we don't have a dictionary matcher.
// returning nullptr here will cause us to fail to find a dictionary break engine, as expected
status = U_ZERO_ERROR;
return nullptr;
}
return nullptr;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */