Files
tubestation/intl/icu/source/common/characterproperties.cpp
Alexandru Marc 52a93f69fa Backed out 19 changesets (bug 1927706) for causing reftest failures @ space-cluster-2.html
Backed out changeset 3877f1fa62f5 (bug 1927706)
Backed out changeset 233c6ebf84a2 (bug 1927706)
Backed out changeset 07e5871d5fa3 (bug 1927706)
Backed out changeset 84ef75087931 (bug 1927706)
Backed out changeset f89b916619e1 (bug 1927706)
Backed out changeset b82d9d622315 (bug 1927706)
Backed out changeset b0d2c5711865 (bug 1927706)
Backed out changeset 9529dda25bd9 (bug 1927706)
Backed out changeset 40b7907d7fc8 (bug 1927706)
Backed out changeset c549655dbd73 (bug 1927706)
Backed out changeset c5cc289771b3 (bug 1927706)
Backed out changeset 8ef66f7822c4 (bug 1927706)
Backed out changeset dff6d37fb2fe (bug 1927706)
Backed out changeset 083a0b3da643 (bug 1927706)
Backed out changeset 06649ac72a19 (bug 1927706)
Backed out changeset 019f7533abbc (bug 1927706)
Backed out changeset f1539604c459 (bug 1927706)
Backed out changeset 578667f1f0d4 (bug 1927706)
Backed out changeset 8ed1e7e7d4ab (bug 1927706)
2024-10-30 11:19:58 +02:00

420 lines
13 KiB
C++

// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// characterproperties.cpp
// created: 2018sep03 Markus W. Scherer
#include "unicode/utypes.h"
#include "unicode/localpointer.h"
#include "unicode/uchar.h"
#include "unicode/ucpmap.h"
#include "unicode/ucptrie.h"
#include "unicode/umutablecptrie.h"
#include "unicode/uniset.h"
#include "unicode/uscript.h"
#include "unicode/uset.h"
#include "cmemory.h"
#include "emojiprops.h"
#include "mutex.h"
#include "normalizer2impl.h"
#include "uassert.h"
#include "ubidi_props.h"
#include "ucase.h"
#include "ucln_cmn.h"
#include "umutex.h"
#include "uprops.h"
using icu::LocalPointer;
#if !UCONFIG_NO_NORMALIZATION
using icu::Normalizer2Factory;
using icu::Normalizer2Impl;
#endif
using icu::UInitOnce;
using icu::UnicodeSet;
namespace {
// Forward declaration; the definition follows the USetAdder callbacks below.
UBool U_CALLCONV characterproperties_cleanup();
// Number of slots in gInclusions: the first UPROPS_SRC_COUNT slots are indexed
// by UPropertySource, followed by one slot per property in
// [UCHAR_INT_START, UCHAR_INT_LIMIT).
constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + (UCHAR_INT_LIMIT - UCHAR_INT_START);
// One cached inclusions set plus the once-flag that guards building it.
struct Inclusion {
// Heap-allocated set, or nullptr while not built; deleted by characterproperties_cleanup().
UnicodeSet *fSet = nullptr;
// Passed to umtx_initOnce() by the getters; reset by characterproperties_cleanup().
UInitOnce fInitOnce {};
};
Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
// Cache of binary-property sets indexed by UProperty; entries are created on
// demand by getBinaryPropertySet() while cpMutex is held.
UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
// Cache of int-property maps indexed by (property - UCHAR_INT_START); entries
// are created on demand by u_getIntPropertyMap() while cpMutex is held.
UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
// Held while sets[] and maps[] are read or filled.
icu::UMutex cpMutex;
//----------------------------------------------------------------
// Inclusions list
//----------------------------------------------------------------
// USetAdder implementation
// Does not use uset.h to reduce code dependencies
// USetAdder callback: adds the single code point c to the UnicodeSet behind
// the opaque USet handle.
void U_CALLCONV
_set_add(USet *set, UChar32 c) {
    UnicodeSet *target = reinterpret_cast<UnicodeSet *>(set);
    target->add(c);
}
// USetAdder callback: adds the code point range start..end to the UnicodeSet
// behind the opaque USet handle.
void U_CALLCONV
_set_addRange(USet *set, UChar32 start, UChar32 end) {
    UnicodeSet *target = reinterpret_cast<UnicodeSet *>(set);
    target->add(start, end);
}
// USetAdder callback: adds the string str to the UnicodeSet behind the opaque
// USet handle. A negative length is passed through to the UnicodeString
// constructor with its first (UBool) argument set to true.
void U_CALLCONV
_set_addString(USet *set, const char16_t *str, int32_t length) {
    const UBool lengthIsNegative = static_cast<UBool>(length < 0);
    const icu::UnicodeString text(lengthIsNegative, str, length);
    UnicodeSet *target = reinterpret_cast<UnicodeSet *>(set);
    target->add(text);
}
// Library cleanup hook: releases every cached inclusions set, binary-property
// set and int-property map, and re-arms the once-flags so that the caches can
// be rebuilt afterwards. Always returns true.
UBool U_CALLCONV characterproperties_cleanup() {
    for (int32_t idx = 0; idx < UPRV_LENGTHOF(gInclusions); ++idx) {
        Inclusion &entry = gInclusions[idx];
        delete entry.fSet;
        entry.fSet = nullptr;
        entry.fInitOnce.reset();
    }
    for (UnicodeSet *&cachedSet : sets) {
        delete cachedSet;
        cachedSet = nullptr;
    }
    for (UCPMap *&cachedMap : maps) {
        // The maps are stored as UCPMap but were built as UCPTrie objects.
        ucptrie_close(reinterpret_cast<UCPTrie *>(cachedMap));
        cachedMap = nullptr;
    }
    return true;
}
// Builds the inclusions set for one property source and stores it in
// gInclusions[src]. On any failure errorCode is set and nothing is cached.
void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
    // Only umtx_initOnce() calls this function.
    U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
    if (src == UPROPS_SRC_NONE) {
        errorCode = U_INTERNAL_PROGRAM_ERROR;
        return;
    }
    U_ASSERT(gInclusions[src].fSet == nullptr);

    LocalPointer<UnicodeSet> starts(new UnicodeSet());
    if (starts.isNull()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    USetAdder adder = {
        reinterpret_cast<USet *>(starts.getAlias()),
        _set_add,
        _set_addRange,
        _set_addString,
        nullptr, // remove() is not needed
        nullptr  // removeRange() is not needed
    };

    // Let the data source(s) behind src contribute their property starts.
    switch (src) {
    case UPROPS_SRC_CHAR:
        uchar_addPropertyStarts(&adder, &errorCode);
        break;
    case UPROPS_SRC_PROPSVEC:
        upropsvec_addPropertyStarts(&adder, &errorCode);
        break;
    case UPROPS_SRC_CHAR_AND_PROPSVEC:
        uchar_addPropertyStarts(&adder, &errorCode);
        upropsvec_addPropertyStarts(&adder, &errorCode);
        break;
#if !UCONFIG_NO_NORMALIZATION
    case UPROPS_SRC_CASE_AND_NORM: {
        const Normalizer2Impl *nfc = Normalizer2Factory::getNFCImpl(errorCode);
        if (U_SUCCESS(errorCode)) {
            nfc->addPropertyStarts(&adder, errorCode);
        }
        ucase_addPropertyStarts(&adder, &errorCode);
        break;
    }
    case UPROPS_SRC_NFC: {
        const Normalizer2Impl *nfc = Normalizer2Factory::getNFCImpl(errorCode);
        if (U_SUCCESS(errorCode)) {
            nfc->addPropertyStarts(&adder, errorCode);
        }
        break;
    }
    case UPROPS_SRC_NFKC: {
        const Normalizer2Impl *nfkc = Normalizer2Factory::getNFKCImpl(errorCode);
        if (U_SUCCESS(errorCode)) {
            nfkc->addPropertyStarts(&adder, errorCode);
        }
        break;
    }
    case UPROPS_SRC_NFKC_CF: {
        const Normalizer2Impl *nfkcCf = Normalizer2Factory::getNFKC_CFImpl(errorCode);
        if (U_SUCCESS(errorCode)) {
            nfkcCf->addPropertyStarts(&adder, errorCode);
        }
        break;
    }
    case UPROPS_SRC_NFC_CANON_ITER: {
        const Normalizer2Impl *nfc = Normalizer2Factory::getNFCImpl(errorCode);
        if (U_SUCCESS(errorCode)) {
            nfc->addCanonIterPropertyStarts(&adder, errorCode);
        }
        break;
    }
#endif
    case UPROPS_SRC_CASE:
        ucase_addPropertyStarts(&adder, &errorCode);
        break;
    case UPROPS_SRC_BIDI:
        ubidi_addPropertyStarts(&adder, &errorCode);
        break;
    case UPROPS_SRC_INPC:
    case UPROPS_SRC_INSC:
    case UPROPS_SRC_VO:
        uprops_addPropertyStarts(src, &adder, &errorCode);
        break;
    case UPROPS_SRC_EMOJI: {
        const icu::EmojiProps *emoji = icu::EmojiProps::getSingleton(errorCode);
        if (U_SUCCESS(errorCode)) {
            emoji->addPropertyStarts(&adder, errorCode);
        }
        break;
    }
    default:
        errorCode = U_INTERNAL_PROGRAM_ERROR;
        break;
    }

    if (U_FAILURE(errorCode)) {
        return;
    }
    if (starts->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    // Shrink the storage before the set is cached.
    starts->compact();
    gInclusions[src].fSet = starts.orphan();
    ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
}
// Returns the cached inclusions set for a property source, building it on
// first use via umtx_initOnce(). Sets U_ILLEGAL_ARGUMENT_ERROR and returns
// nullptr when src is outside [0, UPROPS_SRC_COUNT).
const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    const bool srcInRange = 0 <= src && src < UPROPS_SRC_COUNT;
    if (!srcInRange) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    Inclusion &slot = gInclusions[src];
    umtx_initOnce(slot.fInitOnce, &initInclusion, src, errorCode);
    return slot.fSet;
}
// Builds the inclusions set for one int property: starting from the set for
// the property's source, it keeps only the code points at which the property
// value differs from the value seen just before. The result is stored in the
// property's slot of gInclusions.
void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
    // Only umtx_initOnce() calls this function.
    U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
    const int32_t slot = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
    U_ASSERT(gInclusions[slot].fSet == nullptr);

    const UnicodeSet *sourceStarts = getInclusionsForSource(uprops_getSource(prop), errorCode);
    if (U_FAILURE(errorCode)) {
        return;
    }

    LocalPointer<UnicodeSet> valueStarts(new UnicodeSet(0, 0));
    if (valueStarts.isNull()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    const int32_t rangeCount = sourceStarts->getRangeCount();
    int32_t lastValue = 0;
    for (int32_t r = 0; r < rangeCount; ++r) {
        const UChar32 first = sourceStarts->getRangeStart(r);
        const UChar32 last = sourceStarts->getRangeEnd(r);
        for (UChar32 c = first; c <= last; ++c) {
            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
            const int32_t current = u_getIntPropertyValue(c, prop);
            if (current == lastValue) {
                continue;
            }
            valueStarts->add(c);
            lastValue = current;
        }
    }

    if (valueStarts->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    // Shrink the storage before the set is cached.
    valueStarts->compact();
    gInclusions[slot].fSet = valueStarts.orphan();
    ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
}
} // namespace
U_NAMESPACE_BEGIN
// Returns the cached inclusions set for a property. Int properties
// ([UCHAR_INT_START, UCHAR_INT_LIMIT)) have a per-property set; every other
// property uses the set of its property source.
const UnicodeSet *CharacterProperties::getInclusionsForProperty(
        UProperty prop, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    const bool isIntProperty = UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT;
    if (!isIntProperty) {
        const UPropertySource source = uprops_getSource(prop);
        return getInclusionsForSource(source, errorCode);
    }
    Inclusion &slot = gInclusions[UPROPS_SRC_COUNT + (prop - UCHAR_INT_START)];
    umtx_initOnce(slot.fInitOnce, &initIntPropInclusion, prop, errorCode);
    return slot.fSet;
}
U_NAMESPACE_END
namespace {
// Builds the frozen UnicodeSet of everything that has the given binary
// property. Ownership of the returned set passes to the caller.
//
// For the emoji properties of strings (UCHAR_BASIC_EMOJI..UCHAR_RGI_EMOJI)
// the strings come from EmojiProps; properties strictly between those two
// consist only of strings and skip the code point scan.
//
// Returns nullptr and sets errorCode on failure, so a partially built or
// bogus set is never handed back to be cached.
UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return nullptr; }
    LocalPointer<UnicodeSet> set(new UnicodeSet());
    if (set.isNull()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    if (UCHAR_BASIC_EMOJI <= property && property <= UCHAR_RGI_EMOJI) {
        // property of strings
        const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
        if (U_FAILURE(errorCode)) { return nullptr; }
        USetAdder sa = {
            (USet *)set.getAlias(),
            _set_add,
            _set_addRange,
            _set_addString,
            nullptr, // don't need remove()
            nullptr // don't need removeRange()
        };
        ep->addStrings(&sa, property, errorCode);
        // Stop here if collecting the strings failed; otherwise the
        // strings-only path below would return an incomplete set.
        if (U_FAILURE(errorCode)) { return nullptr; }
        if (property != UCHAR_BASIC_EMOJI && property != UCHAR_RGI_EMOJI) {
            // property of _only_ strings
            if (set->isBogus()) {
                errorCode = U_MEMORY_ALLOCATION_ERROR;
                return nullptr;
            }
            // This path does not go through initInclusion() or
            // initIntPropInclusion(), which are the other places that register
            // the cleanup function, so register it here as well.
            ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
            set->freeze();
            return set.orphan();
        }
    }
    const UnicodeSet *inclusions =
        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
    if (U_FAILURE(errorCode)) { return nullptr; }
    int32_t numRanges = inclusions->getRangeCount();
    // Start of the current run of code points that have the property,
    // or -1 while outside such a run.
    UChar32 startHasProperty = -1;
    for (int32_t i = 0; i < numRanges; ++i) {
        UChar32 rangeEnd = inclusions->getRangeEnd(i);
        for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
            // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
            if (u_hasBinaryProperty(c, property)) {
                if (startHasProperty < 0) {
                    // Transition from false to true.
                    startHasProperty = c;
                }
            } else if (startHasProperty >= 0) {
                // Transition from true to false.
                set->add(startHasProperty, c - 1);
                startHasProperty = -1;
            }
        }
    }
    if (startHasProperty >= 0) {
        // The last run extends to the end of the code point range.
        set->add(startHasProperty, 0x10FFFF);
    }
    // Same out-of-memory check as the inclusions builders perform before
    // caching their sets.
    if (set->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    set->freeze();
    return set.orphan();
}
// Builds an immutable code point map holding the values of an int property.
//
// The mutable trie is opened with nullValue as its initial and error value
// (USCRIPT_UNKNOWN for UCHAR_SCRIPT, 0 otherwise), so runs whose value equals
// nullValue are not written. The trie type is FAST for UCHAR_BIDI_CLASS and
// UCHAR_GENERAL_CATEGORY and SMALL otherwise; the value width is chosen from
// the property's maximum value.
//
// Returns nullptr if errorCode is already a failure or if the inclusions set
// cannot be obtained; otherwise returns the result of
// umutablecptrie_buildImmutable().
UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return nullptr; }
    uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
    icu::LocalUMutableCPTriePointer mutableTrie(
        umutablecptrie_open(nullValue, nullValue, &errorCode));
    const UnicodeSet *inclusions =
        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
    if (U_FAILURE(errorCode)) { return nullptr; }
    int32_t numRanges = inclusions->getRangeCount();
    // [start, c - 1] is the current run of code points that share `value`.
    UChar32 start = 0;
    uint32_t value = nullValue;
    for (int32_t i = 0; i < numRanges; ++i) {
        UChar32 rangeEnd = inclusions->getRangeEnd(i);
        for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
            uint32_t nextValue = u_getIntPropertyValue(c, property);
            if (value != nextValue) {
                if (value != nullValue) {
                    umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
                }
                start = c;
                value = nextValue;
            }
        }
    }
    // Flush the final run. Compare against nullValue, as the loop above does:
    // comparing against 0 would drop a trailing run of value 0 when nullValue
    // is non-zero, and would needlessly write a trailing run of nullValue.
    if (value != nullValue) {
        umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
    }
    UCPTrieType type;
    if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
        type = UCPTRIE_TYPE_FAST;
    } else {
        type = UCPTRIE_TYPE_SMALL;
    }
    UCPTrieValueWidth valueWidth;
    // TODO: UCharacterProperty.IntProperty
    int32_t max = u_getIntPropertyMaxValue(property);
    if (max <= 0xff) {
        valueWidth = UCPTRIE_VALUE_BITS_8;
    } else if (max <= 0xffff) {
        valueWidth = UCPTRIE_VALUE_BITS_16;
    } else {
        valueWidth = UCPTRIE_VALUE_BITS_32;
    }
    return reinterpret_cast<UCPMap *>(
        umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
}
} // namespace
U_NAMESPACE_BEGIN
// Returns the cached set for a binary property, creating it with makeSet() on
// first use while cpMutex is held. Sets U_ILLEGAL_ARGUMENT_ERROR and returns
// nullptr when property is outside [0, UCHAR_BINARY_LIMIT).
const UnicodeSet *CharacterProperties::getBinaryPropertySet(UProperty property, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    const bool isBinaryProperty = 0 <= property && property < UCHAR_BINARY_LIMIT;
    if (!isBinaryProperty) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    Mutex lock(&cpMutex);
    UnicodeSet *&cached = sets[property];
    if (cached == nullptr) {
        cached = makeSet(property, errorCode);
    }
    return cached;
}
U_NAMESPACE_END
U_NAMESPACE_USE
// C API wrapper around CharacterProperties::getBinaryPropertySet(): returns
// the set as a USet handle, or nullptr when *pErrorCode indicates a failure.
U_CAPI const USet * U_EXPORT2
u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
    const UnicodeSet *propertySet =
        CharacterProperties::getBinaryPropertySet(property, *pErrorCode);
    if (U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    return propertySet->toUSet();
}
// Returns the cached map for an int property, creating it with makeMap() on
// first use while cpMutex is held. Sets U_ILLEGAL_ARGUMENT_ERROR and returns
// nullptr when property is outside [UCHAR_INT_START, UCHAR_INT_LIMIT).
U_CAPI const UCPMap * U_EXPORT2
u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    const bool isIntProperty = UCHAR_INT_START <= property && property < UCHAR_INT_LIMIT;
    if (!isIntProperty) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    Mutex lock(&cpMutex);
    UCPMap *&cached = maps[property - UCHAR_INT_START];
    if (cached == nullptr) {
        cached = makeMap(property, *pErrorCode);
    }
    return cached;
}