Backed out changeset 3877f1fa62f5 (bug 1927706) Backed out changeset 233c6ebf84a2 (bug 1927706) Backed out changeset 07e5871d5fa3 (bug 1927706) Backed out changeset 84ef75087931 (bug 1927706) Backed out changeset f89b916619e1 (bug 1927706) Backed out changeset b82d9d622315 (bug 1927706) Backed out changeset b0d2c5711865 (bug 1927706) Backed out changeset 9529dda25bd9 (bug 1927706) Backed out changeset 40b7907d7fc8 (bug 1927706) Backed out changeset c549655dbd73 (bug 1927706) Backed out changeset c5cc289771b3 (bug 1927706) Backed out changeset 8ef66f7822c4 (bug 1927706) Backed out changeset dff6d37fb2fe (bug 1927706) Backed out changeset 083a0b3da643 (bug 1927706) Backed out changeset 06649ac72a19 (bug 1927706) Backed out changeset 019f7533abbc (bug 1927706) Backed out changeset f1539604c459 (bug 1927706) Backed out changeset 578667f1f0d4 (bug 1927706) Backed out changeset 8ed1e7e7d4ab (bug 1927706)
305 lines
9.4 KiB
C++
305 lines
9.4 KiB
C++
// © 2016 and later: Unicode, Inc. and others.
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
/*
|
|
************************************************************************************
|
|
* Copyright (C) 2006-2016, International Business Machines Corporation
|
|
* and others. All Rights Reserved.
|
|
************************************************************************************
|
|
*/
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#if !UCONFIG_NO_BREAK_ITERATION
|
|
|
|
#include "unicode/uchar.h"
|
|
#include "unicode/uniset.h"
|
|
#include "unicode/chariter.h"
|
|
#include "unicode/ures.h"
|
|
#include "unicode/udata.h"
|
|
#include "unicode/putil.h"
|
|
#include "unicode/ustring.h"
|
|
#include "unicode/uscript.h"
|
|
#include "unicode/ucharstrie.h"
|
|
#include "unicode/bytestrie.h"
|
|
|
|
#include "brkeng.h"
|
|
#include "cmemory.h"
|
|
#include "dictbe.h"
|
|
#include "lstmbe.h"
|
|
#include "charstr.h"
|
|
#include "dictionarydata.h"
|
|
#include "mutex.h"
|
|
#include "uvector.h"
|
|
#include "umutex.h"
|
|
#include "uresimp.h"
|
|
#include "ubrkimpl.h"
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
/*
|
|
******************************************************************
|
|
*/
|
|
|
|
// Constructor: there is no state to set up at this level.
LanguageBreakEngine::LanguageBreakEngine() = default;

// Destructor, defined out of line in this file: there is nothing to release
// at this level.
LanguageBreakEngine::~LanguageBreakEngine() = default;
|
|
|
|
/*
|
|
******************************************************************
|
|
*/
|
|
|
|
// Constructor: there is no state to set up at this level.
LanguageBreakFactory::LanguageBreakFactory() = default;

// Destructor, defined out of line in this file: there is nothing to release
// at this level.
LanguageBreakFactory::~LanguageBreakFactory() = default;
|
|
|
|
/*
|
|
******************************************************************
|
|
*/
|
|
|
|
// Starts with no handled-character set at all; the set is allocated lazily by
// handleCharacter(). The error code parameter is accepted but not used.
UnhandledEngine::UnhandledEngine(UErrorCode & /*status*/)
    : fHandled(nullptr) {
}

// Releases the handled-character set (if one was ever allocated) and clears
// the pointer afterwards.
UnhandledEngine::~UnhandledEngine() {
    delete fHandled;
    fHandled = nullptr;
}
|
|
|
|
// Reports whether code point c is in the set of characters this engine has
// been told to handle. Returns false while no set has been allocated yet.
UBool
UnhandledEngine::handles(UChar32 c) const {
    const bool isKnown = (fHandled != nullptr) && fHandled->contains(c);
    return isKnown;
}
|
|
|
|
// Advances `text` past the run of characters contained in fHandled, stopping
// at endPos or at the first character that is not in the set. No breaks are
// ever recorded: the start position, the break vector and the phrase-breaking
// flag are all ignored.
//
// @param text    Text to scan; its iteration position is moved forward.
// @param endPos  Native index at which scanning stops.
// @param status  If already a failure on entry, nothing is done.
// @return        Always 0.
int32_t
UnhandledEngine::findBreaks( UText *text,
                             int32_t /* startPos */,
                             int32_t endPos,
                             UVector32 &/*foundBreaks*/,
                             UBool /* isPhraseBreaking */,
                             UErrorCode &status) const {
    if (U_FAILURE(status)) return 0;
    UChar32 c = utext_current32(text);
    // fHandled is null until handleCharacter() has successfully allocated it,
    // so guard the dereference the same way handles() does; with no set there
    // is nothing to skip.
    while (fHandled != nullptr &&
           static_cast<int32_t>(utext_getNativeIndex(text)) < endPos &&
           fHandled->contains(c)) {
        utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
        c = utext_current32(text);
    }
    return 0;
}
|
|
|
|
void
|
|
UnhandledEngine::handleCharacter(UChar32 c) {
|
|
if (fHandled == nullptr) {
|
|
fHandled = new UnicodeSet();
|
|
if (fHandled == nullptr) {
|
|
return;
|
|
}
|
|
}
|
|
if (!fHandled->contains(c)) {
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
// Apply the entire script of the character.
|
|
int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
|
|
fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
|
|
}
|
|
}
|
|
|
|
/*
|
|
******************************************************************
|
|
*/
|
|
|
|
// Starts with no engine stack; getEngineFor() creates it on first use.
// The error code parameter is accepted but not used.
ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/)
    : fEngines(nullptr) {
}

// Destroys the engine stack, if one was ever created. Deleting a null pointer
// is a no-op, so no explicit check is needed.
ICULanguageBreakFactory::~ICULanguageBreakFactory() {
    delete fEngines;
}
|
|
|
|
U_NAMESPACE_END
|
|
U_CDECL_BEGIN
|
|
// Element deleter handed to the engine stack in getEngineFor(): destroys the
// LanguageBreakEngine that the untyped pointer refers to.
static void U_CALLCONV _deleteEngine(void *obj) {
    delete static_cast<const icu::LanguageBreakEngine *>(obj);
}
|
|
U_CDECL_END
|
|
U_NAMESPACE_BEGIN
|
|
|
|
// Returns an engine that handles code point c, reusing a previously loaded
// engine when one claims the character and loading a new one otherwise.
// Returns nullptr when no engine could be obtained, or when creating the
// engine stack or pushing onto it reported an error.
const LanguageBreakEngine *
ICULanguageBreakFactory::getEngineFor(UChar32 c) {
    UErrorCode status = U_ZERO_ERROR;

    // A single function-local mutex is taken before fEngines is read or
    // modified.
    static UMutex gBreakEngineMutex;
    Mutex m(&gBreakEngineMutex);

    if (fEngines != nullptr) {
        // Search the existing engines from the most recently added backwards.
        for (int32_t i = fEngines->size() - 1; i >= 0; --i) {
            const LanguageBreakEngine *candidate =
                static_cast<const LanguageBreakEngine *>(fEngines->elementAt(i));
            if (candidate != nullptr && candidate->handles(c)) {
                return candidate;
            }
        }
    } else {
        // First call: create the stack, with _deleteEngine as element deleter.
        LocalPointer<UStack> engines(new UStack(_deleteEngine, nullptr, status), status);
        if (U_FAILURE(status) ) {
            // Note: no way to return error code to caller.
            return nullptr;
        }
        fEngines = engines.orphan();
    }

    // We didn't find an engine. Create one.
    const LanguageBreakEngine *created = loadEngineFor(c);
    if (created != nullptr) {
        fEngines->push((void *)created, status);
    }
    return U_SUCCESS(status) ? created : nullptr;
}
|
|
|
|
// Builds a new break engine for the script of code point c.
// An LSTM-based engine is tried first; if that does not produce a usable
// engine, a dictionary-based engine is tried for the scripts listed in the
// switch below. Returns nullptr when the script cannot be determined, when no
// dictionary is available, when the script has no dictionary engine, or when
// engine construction fails.
const LanguageBreakEngine *
ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
    UErrorCode status = U_ZERO_ERROR;
    UScriptCode code = uscript_getScript(c, &status);
    if (U_FAILURE(status)) {
        return nullptr;
    }

    const LanguageBreakEngine *engine = nullptr;

    // Try to use LSTM first
    const LSTMData *data = CreateLSTMDataForScript(code, status);
    if (U_SUCCESS(status) && data != nullptr) {
        engine = CreateLSTMBreakEngine(code, data, status);
        if (U_SUCCESS(status) && engine != nullptr) {
            return engine;
        }
        // The LSTM attempt failed. If an engine object exists, only the
        // engine is deleted (presumably it owns `data` by then -- confirm
        // against CreateLSTMBreakEngine); otherwise the data is released
        // directly.
        if (engine == nullptr) {
            DeleteLSTMData(data);
        } else {
            delete engine;
            engine = nullptr;
        }
    }

    status = U_ZERO_ERROR;  // fallback to dictionary based
    DictionaryMatcher *m = loadDictionaryMatcherFor(code);
    if (m == nullptr) {
        return nullptr;
    }

    switch(code) {
    case USCRIPT_THAI:
        engine = new ThaiBreakEngine(m, status);
        break;
    case USCRIPT_LAO:
        engine = new LaoBreakEngine(m, status);
        break;
    case USCRIPT_MYANMAR:
        engine = new BurmeseBreakEngine(m, status);
        break;
    case USCRIPT_KHMER:
        engine = new KhmerBreakEngine(m, status);
        break;

#if !UCONFIG_NO_NORMALIZATION
            // CJK not available w/o normalization
    case USCRIPT_HANGUL:
        engine = new CjkBreakEngine(m, kKorean, status);
        break;

    // use same BreakEngine and dictionary for both Chinese and Japanese
    case USCRIPT_HIRAGANA:
    case USCRIPT_KATAKANA:
    case USCRIPT_HAN:
        engine = new CjkBreakEngine(m, kChineseJapanese, status);
        break;
#if 0
    // TODO: Have to get some characters with script=common handled
    // by CjkBreakEngine (e.g. U+309B). Simply subjecting
    // them to CjkBreakEngine does not work. The engine has to
    // special-case them.
    case USCRIPT_COMMON:
    {
        UBlockCode block = ublock_getCode(code);
        if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
            engine = new CjkBreakEngine(dict, kChineseJapanese, status);
        break;
    }
#endif
#endif

    default:
        break;
    }

    if (engine == nullptr) {
        // No engine was created for this script (or allocation failed), so
        // the matcher is still ours to release.
        delete m;
        return nullptr;
    }
    if (U_FAILURE(status)) {
        // The engine reported a construction error. Only the engine is
        // deleted here; the matcher is not deleted separately.
        delete engine;
        return nullptr;
    }
    return engine;
}
|
|
|
|
// Looks up the dictionary file registered for `script` in the break-iterator
// resource data, opens it, and wraps its trie in a DictionaryMatcher.
// Returns nullptr when no dictionary is registered for the script, when the
// data file cannot be opened, when the trie type is not one of the two
// recognised kinds, or when allocating the matcher fails.
DictionaryMatcher *
ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
    UErrorCode status = U_ZERO_ERROR;

    // open root from brkitr tree.
    UResourceBundle *bundle = ures_open(U_ICUDATA_BRKITR, "", &status);
    bundle = ures_getByKeyWithFallback(bundle, "dictionaries", bundle, &status);

    int32_t nameLength = 0;
    const char16_t *fileName =
        ures_getStringByKeyWithFallback(bundle, uscript_getShortName(script), &nameLength, &status);
    if (U_FAILURE(status)) {
        ures_close(bundle);
        return nullptr;
    }

    // Split the file name at its last dot into base name and extension; with
    // no dot, the whole string is the base name and the extension stays empty.
    CharString baseName;
    CharString extension;
    const char16_t *lastDot = u_memrchr(fileName, 0x002e, nameLength);  // last dot
    if (lastDot != nullptr) {
        const int32_t stemLength = (int32_t)(lastDot - fileName);
        extension.appendInvariantChars(
            UnicodeString(false, lastDot + 1, nameLength - stemLength - 1), status);
        nameLength = stemLength;
    }
    baseName.appendInvariantChars(UnicodeString(false, fileName, nameLength), status);
    ures_close(bundle);

    UDataMemory *file = udata_open(U_ICUDATA_BRKITR, extension.data(), baseName.data(), &status);
    if (U_FAILURE(status)) {
        // we don't have a dictionary matcher.
        // returning nullptr here will cause us to fail to find a dictionary break engine, as expected
        return nullptr;
    }

    // build trie
    const uint8_t *data = (const uint8_t *)udata_getMemory(file);
    const int32_t *indexes = (const int32_t *)data;
    const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
    const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;

    DictionaryMatcher *matcher = nullptr;
    if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
        const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
        const char *characters = (const char *)(data + offset);
        matcher = new BytesDictionaryMatcher(characters, transform, file);
    } else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
        const char16_t *characters = (const char16_t *)(data + offset);
        matcher = new UCharsDictionaryMatcher(characters, file);
    }

    if (matcher == nullptr) {
        // no matcher exists to take ownership - either we are an invalid
        // type or memory allocation failed
        udata_close(file);
    }
    return matcher;
}
|
|
|
|
U_NAMESPACE_END
|
|
|
|
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|