Backed out changeset 3877f1fa62f5 (bug 1927706) Backed out changeset 233c6ebf84a2 (bug 1927706) Backed out changeset 07e5871d5fa3 (bug 1927706) Backed out changeset 84ef75087931 (bug 1927706) Backed out changeset f89b916619e1 (bug 1927706) Backed out changeset b82d9d622315 (bug 1927706) Backed out changeset b0d2c5711865 (bug 1927706) Backed out changeset 9529dda25bd9 (bug 1927706) Backed out changeset 40b7907d7fc8 (bug 1927706) Backed out changeset c549655dbd73 (bug 1927706) Backed out changeset c5cc289771b3 (bug 1927706) Backed out changeset 8ef66f7822c4 (bug 1927706) Backed out changeset dff6d37fb2fe (bug 1927706) Backed out changeset 083a0b3da643 (bug 1927706) Backed out changeset 06649ac72a19 (bug 1927706) Backed out changeset 019f7533abbc (bug 1927706) Backed out changeset f1539604c459 (bug 1927706) Backed out changeset 578667f1f0d4 (bug 1927706) Backed out changeset 8ed1e7e7d4ab (bug 1927706)
420 lines
13 KiB
C++
420 lines
13 KiB
C++
// © 2018 and later: Unicode, Inc. and others.
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
|
|
// characterproperties.cpp
|
|
// created: 2018sep03 Markus W. Scherer
|
|
|
|
#include "unicode/utypes.h"
|
|
#include "unicode/localpointer.h"
|
|
#include "unicode/uchar.h"
|
|
#include "unicode/ucpmap.h"
|
|
#include "unicode/ucptrie.h"
|
|
#include "unicode/umutablecptrie.h"
|
|
#include "unicode/uniset.h"
|
|
#include "unicode/uscript.h"
|
|
#include "unicode/uset.h"
|
|
#include "cmemory.h"
|
|
#include "emojiprops.h"
|
|
#include "mutex.h"
|
|
#include "normalizer2impl.h"
|
|
#include "uassert.h"
|
|
#include "ubidi_props.h"
|
|
#include "ucase.h"
|
|
#include "ucln_cmn.h"
|
|
#include "umutex.h"
|
|
#include "uprops.h"
|
|
|
|
using icu::LocalPointer;
|
|
#if !UCONFIG_NO_NORMALIZATION
|
|
using icu::Normalizer2Factory;
|
|
using icu::Normalizer2Impl;
|
|
#endif
|
|
using icu::UInitOnce;
|
|
using icu::UnicodeSet;
|
|
|
|
namespace {
|
|
|
|
// Forward declaration of the cleanup function that is defined further down in this file.
// It is declared early because initInclusion() and initIntPropInclusion() pass it to
// ucln_common_registerCleanup().
UBool U_CALLCONV characterproperties_cleanup();
|
|
|
|
// Number of entries in gInclusions: one per UPropertySource value (indexes
// 0..UPROPS_SRC_COUNT-1), followed by one per integer-valued property
// (index UPROPS_SRC_COUNT + (prop - UCHAR_INT_START)).
constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + (UCHAR_INT_LIMIT - UCHAR_INT_START);
|
|
|
|
struct Inclusion {
|
|
UnicodeSet *fSet = nullptr;
|
|
UInitOnce fInitOnce {};
|
|
};
|
|
// Cache of inclusion sets. Entries [0, UPROPS_SRC_COUNT) are filled per property source by
// initInclusion(); the remaining entries are filled per integer property by
// initIntPropInclusion(). Each entry is initialized at most once through its fInitOnce.
Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
|
|
|
|
// Cache of sets built by makeSet(), indexed by property value.
// Read and written in getBinaryPropertySet() while cpMutex is held.
UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
|
|
|
|
// Cache of maps built by makeMap(), indexed by (property - UCHAR_INT_START).
// Read and written in u_getIntPropertyMap() while cpMutex is held.
UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
|
|
|
|
// Serializes lookups and lazy creation in the sets[] and maps[] caches.
icu::UMutex cpMutex;
|
|
|
|
//----------------------------------------------------------------
|
|
// Inclusions list
|
|
//----------------------------------------------------------------
|
|
|
|
// USetAdder implementation
|
|
// Does not use uset.h to reduce code dependencies
|
|
void U_CALLCONV
|
|
_set_add(USet *set, UChar32 c) {
|
|
((UnicodeSet *)set)->add(c);
|
|
}
|
|
|
|
void U_CALLCONV
|
|
_set_addRange(USet *set, UChar32 start, UChar32 end) {
|
|
((UnicodeSet *)set)->add(start, end);
|
|
}
|
|
|
|
void U_CALLCONV
|
|
_set_addString(USet *set, const char16_t *str, int32_t length) {
|
|
((UnicodeSet *)set)->add(icu::UnicodeString((UBool)(length<0), str, length));
|
|
}
|
|
|
|
// Releases everything cached in this file and returns all caches to their initial state:
// the inclusion sets (including their init-once state), the per-property sets, and the
// per-property maps. Always returns true.
UBool U_CALLCONV characterproperties_cleanup() {
    for (Inclusion &entry : gInclusions) {
        delete entry.fSet;
        entry.fSet = nullptr;
        // Allow the entry to be lazily rebuilt later.
        entry.fInitOnce.reset();
    }
    for (UnicodeSet *&cachedSet : sets) {
        delete cachedSet;
        cachedSet = nullptr;
    }
    for (UCPMap *&cachedMap : maps) {
        // The maps are closed as UCPTrie objects (see makeMap(), which builds them that way).
        ucptrie_close(reinterpret_cast<UCPTrie *>(cachedMap));
        cachedMap = nullptr;
    }
    return true;
}
|
|
|
|
// Builds the inclusion set for the property source src and stores it in gInclusions[src].
//
// src must be a valid source other than UPROPS_SRC_NONE; UPROPS_SRC_NONE and any source
// without a case below set U_INTERNAL_PROGRAM_ERROR. An allocation failure sets
// U_MEMORY_ALLOCATION_ERROR. On any failure nothing is cached.
void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
    // This function is invoked only via umtx_initOnce().
    U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
    if (src == UPROPS_SRC_NONE) {
        errorCode = U_INTERNAL_PROGRAM_ERROR;
        return;
    }
    Inclusion &slot = gInclusions[src];
    U_ASSERT(slot.fSet == nullptr);

    LocalPointer<UnicodeSet> starts(new UnicodeSet());
    if (starts.isNull()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    // Callback bundle through which the data providers add code points to the new set.
    USetAdder adder = {
        reinterpret_cast<USet *>(starts.getAlias()),
        _set_add,
        _set_addRange,
        _set_addString,
        nullptr,  // don't need remove()
        nullptr   // don't need removeRange()
    };

#if !UCONFIG_NO_NORMALIZATION
    // Adds the property starts of one normalizer, unless fetching it (or anything
    // earlier) has already failed.
    const auto addNormalizerStarts = [&](const Normalizer2Impl *impl) {
        if (U_SUCCESS(errorCode)) {
            impl->addPropertyStarts(&adder, errorCode);
        }
    };
#endif

    switch (src) {
    case UPROPS_SRC_CHAR:
        uchar_addPropertyStarts(&adder, &errorCode);
        break;
    case UPROPS_SRC_PROPSVEC:
        upropsvec_addPropertyStarts(&adder, &errorCode);
        break;
    case UPROPS_SRC_CHAR_AND_PROPSVEC:
        uchar_addPropertyStarts(&adder, &errorCode);
        upropsvec_addPropertyStarts(&adder, &errorCode);
        break;
#if !UCONFIG_NO_NORMALIZATION
    case UPROPS_SRC_CASE_AND_NORM:
        addNormalizerStarts(Normalizer2Factory::getNFCImpl(errorCode));
        ucase_addPropertyStarts(&adder, &errorCode);
        break;
    case UPROPS_SRC_NFC:
        addNormalizerStarts(Normalizer2Factory::getNFCImpl(errorCode));
        break;
    case UPROPS_SRC_NFKC:
        addNormalizerStarts(Normalizer2Factory::getNFKCImpl(errorCode));
        break;
    case UPROPS_SRC_NFKC_CF:
        addNormalizerStarts(Normalizer2Factory::getNFKC_CFImpl(errorCode));
        break;
    case UPROPS_SRC_NFC_CANON_ITER: {
        const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(errorCode);
        if (U_SUCCESS(errorCode)) {
            nfcImpl->addCanonIterPropertyStarts(&adder, errorCode);
        }
        break;
    }
#endif
    case UPROPS_SRC_CASE:
        ucase_addPropertyStarts(&adder, &errorCode);
        break;
    case UPROPS_SRC_BIDI:
        ubidi_addPropertyStarts(&adder, &errorCode);
        break;
    case UPROPS_SRC_INPC:
    case UPROPS_SRC_INSC:
    case UPROPS_SRC_VO:
        uprops_addPropertyStarts(src, &adder, &errorCode);
        break;
    case UPROPS_SRC_EMOJI: {
        const icu::EmojiProps *emoji = icu::EmojiProps::getSingleton(errorCode);
        if (U_SUCCESS(errorCode)) {
            emoji->addPropertyStarts(&adder, errorCode);
        }
        break;
    }
    default:
        errorCode = U_INTERNAL_PROGRAM_ERROR;
        break;
    }

    // A bogus set after otherwise successful collection is reported as out-of-memory.
    if (U_SUCCESS(errorCode) && starts->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
    }
    if (U_FAILURE(errorCode)) {
        return;
    }
    // Compact for caching.
    starts->compact();
    slot.fSet = starts.orphan();
    ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
}
|
|
|
|
// Returns the cached inclusion set for the property source src, building it on first use.
//
// Returns nullptr if errorCode already indicates a failure on entry. Sets
// U_ILLEGAL_ARGUMENT_ERROR and returns nullptr if src is outside [0, UPROPS_SRC_COUNT).
// Otherwise returns the cache entry's set, which is still nullptr if initialization failed.
const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    const bool isValidSource = 0 <= src && src < UPROPS_SRC_COUNT;
    if (!isValidSource) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    Inclusion &entry = gInclusions[src];
    umtx_initOnce(entry.fInitOnce, &initInclusion, src, errorCode);
    return entry.fSet;
}
|
|
|
|
// Builds the inclusion set for the integer property prop and stores it in the gInclusions
// entry at UPROPS_SRC_COUNT + (prop - UCHAR_INT_START).
//
// Starting from the inclusion set of the property's source, it records each code point at
// which u_getIntPropertyValue() differs from the value of the previously visited code point.
// The new set starts out containing code point 0, and the "previous" value starts at 0.
// On failure errorCode is set and nothing is cached.
void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
    // This function is invoked only via umtx_initOnce().
    U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
    Inclusion &slot = gInclusions[UPROPS_SRC_COUNT + (prop - UCHAR_INT_START)];
    U_ASSERT(slot.fSet == nullptr);

    const UnicodeSet *sourceStarts = getInclusionsForSource(uprops_getSource(prop), errorCode);
    if (U_FAILURE(errorCode)) {
        return;
    }

    LocalPointer<UnicodeSet> valueStarts(new UnicodeSet(0, 0));
    if (valueStarts.isNull()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    int32_t lastValue = 0;
    const int32_t rangeCount = sourceStarts->getRangeCount();
    for (int32_t r = 0; r < rangeCount; ++r) {
        const UChar32 first = sourceStarts->getRangeStart(r);
        const UChar32 last = sourceStarts->getRangeEnd(r);
        for (UChar32 c = first; c <= last; ++c) {
            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
            const int32_t value = u_getIntPropertyValue(c, prop);
            if (value == lastValue) {
                continue;
            }
            valueStarts->add(c);
            lastValue = value;
        }
    }

    if (valueStarts->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    // Compact for caching.
    valueStarts->compact();
    slot.fSet = valueStarts.orphan();
    ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
}
|
|
|
|
} // namespace
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
// Returns the cached inclusion set to use for the property prop, building it on first use.
//
// Integer properties (UCHAR_INT_START <= prop < UCHAR_INT_LIMIT) have their own
// per-property entry; every other property uses the set of its property source.
// Returns nullptr if errorCode already indicates a failure on entry.
const UnicodeSet *CharacterProperties::getInclusionsForProperty(
        UProperty prop, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    const bool isIntProperty = UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT;
    if (!isIntProperty) {
        return getInclusionsForSource(uprops_getSource(prop), errorCode);
    }
    Inclusion &entry = gInclusions[UPROPS_SRC_COUNT + (prop - UCHAR_INT_START)];
    umtx_initOnce(entry.fInitOnce, &initIntPropInclusion, prop, errorCode);
    return entry.fSet;
}
|
|
|
|
U_NAMESPACE_END
|
|
|
|
namespace {
|
|
|
|
// Builds a new, frozen UnicodeSet for the given property.
//
// For properties in UCHAR_BASIC_EMOJI..UCHAR_RGI_EMOJI the strings are added via EmojiProps;
// all of those except UCHAR_BASIC_EMOJI and UCHAR_RGI_EMOJI consist of strings only and are
// returned right away. For every other property the code points are found by walking the
// property's inclusion set and testing u_hasBinaryProperty().
//
// Returns a set owned by the caller, or nullptr with errorCode set on failure.
// A failure never returns a partially built set: the caller (getBinaryPropertySet) caches
// whatever non-null pointer it receives, so an incomplete set would otherwise be served
// as a success on later calls.
UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return nullptr; }
    LocalPointer<UnicodeSet> set(new UnicodeSet());
    if (set.isNull()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    if (UCHAR_BASIC_EMOJI <= property && property <= UCHAR_RGI_EMOJI) {
        // property of strings
        const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
        if (U_FAILURE(errorCode)) { return nullptr; }
        USetAdder sa = {
            reinterpret_cast<USet *>(set.getAlias()),
            _set_add,
            _set_addRange,
            _set_addString,
            nullptr,  // don't need remove()
            nullptr   // don't need removeRange()
        };
        ep->addStrings(&sa, property, errorCode);
        // Do not hand out (and let the caller cache) a set whose strings failed to load.
        if (U_FAILURE(errorCode)) { return nullptr; }
        if (property != UCHAR_BASIC_EMOJI && property != UCHAR_RGI_EMOJI) {
            // property of _only_ strings
            if (set->isBogus()) {
                errorCode = U_MEMORY_ALLOCATION_ERROR;
                return nullptr;
            }
            set->freeze();
            return set.orphan();
        }
    }

    const UnicodeSet *inclusions =
        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
    if (U_FAILURE(errorCode)) { return nullptr; }
    int32_t numRanges = inclusions->getRangeCount();
    // Start of the current run of code points that have the property, or -1 if none is open.
    UChar32 startHasProperty = -1;

    for (int32_t i = 0; i < numRanges; ++i) {
        UChar32 rangeEnd = inclusions->getRangeEnd(i);
        for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
            // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
            if (u_hasBinaryProperty(c, property)) {
                if (startHasProperty < 0) {
                    // Transition from false to true.
                    startHasProperty = c;
                }
            } else if (startHasProperty >= 0) {
                // Transition from true to false.
                set->add(startHasProperty, c - 1);
                startHasProperty = -1;
            }
        }
    }
    if (startHasProperty >= 0) {
        // The last run is still open: it extends to the end of the code point range.
        set->add(startHasProperty, 0x10FFFF);
    }
    // Same treatment as in initInclusion()/initIntPropInclusion(): a bogus set is
    // reported as an allocation failure instead of being cached.
    if (set->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    set->freeze();
    return set.orphan();
}
|
|
|
|
// Builds a new immutable code point map holding the values of the integer property.
//
// The mutable trie is opened with nullValue as both of its value arguments: USCRIPT_UNKNOWN
// for UCHAR_SCRIPT, 0 for every other property. Only ranges whose value differs from
// nullValue are written. Range boundaries come from the property's inclusion set.
//
// Returns the result of umutablecptrie_buildImmutable() cast to UCPMap *, or nullptr if
// errorCode indicates a failure before the build step.
UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return nullptr; }
    uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
    icu::LocalUMutableCPTriePointer mutableTrie(
        umutablecptrie_open(nullValue, nullValue, &errorCode));
    const UnicodeSet *inclusions =
        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
    if (U_FAILURE(errorCode)) { return nullptr; }
    int32_t numRanges = inclusions->getRangeCount();
    // [start, c-1] is the current run of code points that all have `value`.
    UChar32 start = 0;
    uint32_t value = nullValue;

    for (int32_t i = 0; i < numRanges; ++i) {
        UChar32 rangeEnd = inclusions->getRangeEnd(i);
        for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
            uint32_t nextValue = u_getIntPropertyValue(c, property);
            if (value != nextValue) {
                if (value != nullValue) {
                    umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
                }
                start = c;
                value = nextValue;
            }
        }
    }
    // Flush the final run. Compare against nullValue (not a literal 0), exactly as in the
    // loop above: with a non-zero nullValue, a trailing run whose value is 0 must still be
    // written, and a trailing run equal to nullValue is already covered by the trie's
    // initial value.
    if (value != nullValue) {
        umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
    }

    // BiDi class and General_Category get the "fast" trie type, everything else "small".
    UCPTrieType type;
    if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
        type = UCPTRIE_TYPE_FAST;
    } else {
        type = UCPTRIE_TYPE_SMALL;
    }
    // Pick the narrowest value width that can hold the property's maximum value.
    UCPTrieValueWidth valueWidth;
    // TODO: UCharacterProperty.IntProperty
    int32_t max = u_getIntPropertyMaxValue(property);
    if (max <= 0xff) {
        valueWidth = UCPTRIE_VALUE_BITS_8;
    } else if (max <= 0xffff) {
        valueWidth = UCPTRIE_VALUE_BITS_16;
    } else {
        valueWidth = UCPTRIE_VALUE_BITS_32;
    }
    return reinterpret_cast<UCPMap *>(
        umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
}
|
|
|
|
} // namespace
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
// Returns the cached set for the given property, creating it with makeSet() on first use.
//
// Returns nullptr if errorCode already indicates a failure on entry. Sets
// U_ILLEGAL_ARGUMENT_ERROR and returns nullptr if property is outside
// [0, UCHAR_BINARY_LIMIT). The cache lookup and creation run while cpMutex is held.
const UnicodeSet *CharacterProperties::getBinaryPropertySet(UProperty property, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    const bool isInRange = 0 <= property && property < UCHAR_BINARY_LIMIT;
    if (!isInRange) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    Mutex lock(&cpMutex);
    UnicodeSet *&slot = sets[property];
    if (slot == nullptr) {
        slot = makeSet(property, errorCode);
    }
    return slot;
}
|
|
|
|
U_NAMESPACE_END
|
|
|
|
U_NAMESPACE_USE
|
|
|
|
// C API wrapper around CharacterProperties::getBinaryPropertySet().
// Returns the set as a USet, or nullptr if *pErrorCode indicates a failure after the call.
U_CAPI const USet * U_EXPORT2
u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
    const UnicodeSet *propertySet =
        CharacterProperties::getBinaryPropertySet(property, *pErrorCode);
    if (U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    return propertySet->toUSet();
}
|
|
|
|
// Returns the cached map for the given integer property, creating it with makeMap() on
// first use.
//
// Returns nullptr if *pErrorCode already indicates a failure on entry. Sets
// U_ILLEGAL_ARGUMENT_ERROR and returns nullptr if property is outside
// [UCHAR_INT_START, UCHAR_INT_LIMIT). The cache lookup and creation run while cpMutex
// is held.
U_CAPI const UCPMap * U_EXPORT2
u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    const bool isIntProperty = UCHAR_INT_START <= property && property < UCHAR_INT_LIMIT;
    if (!isIntProperty) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    Mutex lock(&cpMutex);
    UCPMap *&slot = maps[property - UCHAR_INT_START];
    if (slot == nullptr) {
        slot = makeMap(property, *pErrorCode);
    }
    return slot;
}
|