From 34cc6afd1cd58fc8ac8a7fc8ea2ad6256444856c Mon Sep 17 00:00:00 2001 From: Butkovits Atila Date: Fri, 15 Oct 2021 02:02:25 +0300 Subject: [PATCH] Backed out 9 changesets (bug 1719746, bug 1735341) as requested by dev. CLOSED TREE Backed out changeset f1b3e7dec7e2 (bug 1735341) Backed out changeset e7675e3524da (bug 1719746) Backed out changeset 05109157a4b5 (bug 1719746) Backed out changeset b98f10477f44 (bug 1719746) Backed out changeset dce9c0d6c79c (bug 1719746) Backed out changeset 472767f43cad (bug 1719746) Backed out changeset 4fa55bded471 (bug 1719746) Backed out changeset 391c305dce1f (bug 1719746) Backed out changeset 5f20632de2be (bug 1719746) --- .clang-format-ignore | 2 +- intl/components/gtest/TestLocale.cpp | 77 +- intl/components/moz.build | 2 - intl/components/src/DateTimeFormat.h | 2 +- .../components/src/DateTimePatternGenerator.h | 36 + intl/components/src/ICU4CGlue.h | 6 +- intl/components/src/Locale.cpp | 1469 --------------- intl/components/src/Locale.h | 666 +------ intl/components/src/LocaleCanonicalizer.cpp | 2 +- intl/components/src/RelativeTimeFormat.h | 2 +- intl/components/src/TimeZone.cpp | 6 +- intl/docs/icu.rst | 2 +- js/public/friend/ErrorNumbers.msg | 2 +- js/src/builtin/intl/Collator.cpp | 19 +- js/src/builtin/intl/DateTimeFormat.cpp | 14 +- js/src/builtin/intl/DisplayNames.cpp | 111 +- js/src/builtin/intl/FormatBuffer.h | 5 +- js/src/builtin/intl/IntlObject.cpp | 132 +- js/src/builtin/intl/LanguageTag.cpp | 1605 ++++++++++++++++- js/src/builtin/intl/LanguageTag.h | 711 +++++++- .../src/builtin/intl/LanguageTagGenerated.cpp | 218 ++- js/src/builtin/intl/Locale.cpp | 222 +-- js/src/builtin/intl/NumberFormat.cpp | 15 +- js/src/builtin/intl/RelativeTimeFormat.cpp | 14 +- js/src/builtin/intl/SharedIntlData.cpp | 8 +- js/src/builtin/intl/StringAsciiChars.h | 78 - js/src/builtin/intl/make_intl_data.py | 111 +- js/src/moz.build | 1 + 28 files changed, 2651 insertions(+), 2887 deletions(-) delete mode 100644 
intl/components/src/Locale.cpp rename intl/components/src/LocaleGenerated.cpp => js/src/builtin/intl/LanguageTagGenerated.cpp (84%) delete mode 100644 js/src/builtin/intl/StringAsciiChars.h diff --git a/.clang-format-ignore b/.clang-format-ignore index 881d824e7d24..db9e8f9a46e9 100644 --- a/.clang-format-ignore +++ b/.clang-format-ignore @@ -4,7 +4,7 @@ build/clang-plugin/.* config/gcc-stl-wrapper.template.h config/msvc-stl-wrapper.template.h # Generated code -intl/components/src/LocaleGenerated.cpp +js/src/builtin/intl/LanguageTagGenerated.cpp js/src/builtin/intl/TimeZoneDataGenerated.h # Don't want to reformat irregexp (third-party code) diff --git a/intl/components/gtest/TestLocale.cpp b/intl/components/gtest/TestLocale.cpp index 4e29eabe4d19..03dde0531c9d 100644 --- a/intl/components/gtest/TestLocale.cpp +++ b/intl/components/gtest/TestLocale.cpp @@ -4,85 +4,12 @@ #include "gtest/gtest.h" #include "mozilla/intl/Locale.h" -#include "mozilla/Span.h" -#include "TestBuffer.h" +#include +#include namespace mozilla::intl { -TEST(IntlLocale, LocaleSettersAndGetters) -{ - Locale locale; - locale.setLanguage("fr"); - locale.setRegion("CA"); - locale.setScript("Latn"); - ASSERT_TRUE(locale.setUnicodeExtension("u-ca-gregory")); - ASSERT_TRUE(locale.language().equalTo("fr")); - ASSERT_TRUE(locale.region().equalTo("CA")); - ASSERT_TRUE(locale.script().equalTo("Latn")); - ASSERT_EQ(MakeStringSpan(locale.unicodeExtension()), - MakeStringSpan("u-ca-gregory")); - - TestBuffer buffer; - ASSERT_TRUE(locale.toString(buffer).isOk()); - ASSERT_TRUE(buffer.verboseMatches("fr-Latn-CA-u-ca-gregory")); - - // No setters for variants or other extensions... 
- Locale locale2; - ASSERT_TRUE(LocaleParser::tryParse( - MakeStringSpan("fr-CA-fonipa-t-es-AR-h0-hybrid"), locale2) - .isOk()); - ASSERT_EQ(MakeStringSpan(locale2.variants()[0].get()), - MakeStringSpan("fonipa")); - ASSERT_EQ(MakeStringSpan(locale2.extensions()[0].get()), - MakeStringSpan("t-es-AR-h0-hybrid")); - locale2.clearVariants(); - ASSERT_EQ(locale2.variants().length(), 0UL); -} - -TEST(IntlLocale, LocaleParser) -{ - const char* tags[] = { - "en-US", "en-GB", "es-AR", "it", "zh-Hans-CN", - "de-AT", "pl", "fr-FR", "de-AT", "sr-Cyrl-SR", - "nb-NO", "fr-FR", "mk", "uk", "und-PL", - "und-Latn-AM", "ug-Cyrl", "sr-ME", "mn-Mong", "lif-Limb", - "gan", "zh-Hant", "yue-Hans", "unr", "unr-Deva", - "und-Thai-CN", "ug-Cyrl", "en-Latn-DE", "pl-FR", "de-CH", - "tuq", "sr-ME", "ng", "klx", "kk-Arab", - "en-Cyrl", "und-Cyrl-UK", "und-Arab", "und-Arab-FO"}; - - Locale locale; - for (const auto* tag : tags) { - ASSERT_TRUE(LocaleParser::tryParse(MakeStringSpan(tag), locale).isOk()); - } -} - -TEST(IntlLocale, LikelySubtags) -{ - Locale locale; - ASSERT_TRUE(LocaleParser::tryParse(MakeStringSpan("zh"), locale).isOk()); - ASSERT_TRUE(locale.addLikelySubtags()); - TestBuffer buffer; - ASSERT_TRUE(locale.toString(buffer).isOk()); - ASSERT_TRUE(buffer.verboseMatches("zh-Hans-CN")); - ASSERT_TRUE(locale.removeLikelySubtags()); - buffer.clear(); - ASSERT_TRUE(locale.toString(buffer).isOk()); - ASSERT_TRUE(buffer.verboseMatches("zh")); -} - -TEST(IntlLocale, Canonicalize) -{ - Locale locale; - ASSERT_TRUE( - LocaleParser::tryParse(MakeStringSpan("nob-bokmal"), locale).isOk()); - ASSERT_TRUE(locale.canonicalize().isOk()); - TestBuffer buffer; - ASSERT_TRUE(locale.toString(buffer).isOk()); - ASSERT_TRUE(buffer.verboseMatches("nb")); -} - // These tests are dependent on the machine that this test is being run on. 
TEST(IntlLocale, SystemDependentTests) { diff --git a/intl/components/moz.build b/intl/components/moz.build index affeb6206766..74180218ad67 100644 --- a/intl/components/moz.build +++ b/intl/components/moz.build @@ -35,9 +35,7 @@ UNIFIED_SOURCES += [ "src/ICU4CGlue.cpp", "src/ICU4CLibrary.cpp", "src/ListFormat.cpp", - "src/Locale.cpp", "src/LocaleCanonicalizer.cpp", - "src/LocaleGenerated.cpp", "src/MeasureUnit.cpp", "src/NumberFormat.cpp", "src/NumberFormatFields.cpp", diff --git a/intl/components/src/DateTimeFormat.h b/intl/components/src/DateTimeFormat.h index bb9b4a50c107..b65778712073 100644 --- a/intl/components/src/DateTimeFormat.h +++ b/intl/components/src/DateTimeFormat.h @@ -327,7 +327,7 @@ class DateTimeFormat final { // Write the formatted date into the u16Buffer. PatternVector u16Vec; - auto result = FillBufferWithICUCall( + auto result = FillVectorWithICUCall( u16Vec, [this, &aUnixEpoch](UChar* target, int32_t length, UErrorCode* status) { return udat_format(mDateFormat, aUnixEpoch, target, length, diff --git a/intl/components/src/DateTimePatternGenerator.h b/intl/components/src/DateTimePatternGenerator.h index a5de35cee74c..fb97ebe9b4ae 100644 --- a/intl/components/src/DateTimePatternGenerator.h +++ b/intl/components/src/DateTimePatternGenerator.h @@ -76,6 +76,25 @@ class DateTimePatternGenerator final { }); } + /** + * Given a skeleton (a string with unordered datetime fields), get a best + * pattern that will fit for that locale. This pattern will be filled into the + * buffer. e.g. The skeleton "yMd" would return the pattern "M/d/y" for en-US, + * or "dd/MM/y" for en-GB. 
+ */ + template + ICUResult GetBestPattern(Span aSkeleton, + Vector& aVector, + EnumSet options = {}) { + return FillVectorWithICUCall( + aVector, [&](UChar* target, int32_t length, UErrorCode* status) { + return udatpg_getBestPatternWithOptions( + mGenerator.GetMut(), aSkeleton.data(), + static_cast(aSkeleton.Length()), + toUDateTimePatternMatchOptions(options), target, length, status); + }); + } + /** * Get a skeleton (a string with unordered datetime fields) from a pattern. * For example, both "MMM-dd" and "dd/MMM" produce the skeleton "MMMdd". @@ -92,6 +111,23 @@ class DateTimePatternGenerator final { }); } + /** + * Get a skeleton (a string with unordered datetime fields) from a pattern. + * For example, both "MMM-dd" and "dd/MMM" produce the skeleton "MMMdd". + */ + template + static ICUResult GetSkeleton(Span aPattern, + Vector& aVector) { + // At one time udatpg_getSkeleton required a UDateTimePatternGenerator*, but + // now it is valid to pass in a nullptr. + return FillVectorWithICUCall( + aVector, [&](UChar* target, int32_t length, UErrorCode* status) { + return udatpg_getSkeleton(nullptr, aPattern.data(), + static_cast(aPattern.Length()), + target, length, status); + }); + } + /** * TODO(Bug 1686965) - Temporarily get the underlying ICU object while * migrating to the unified API. This should be removed when completing the diff --git a/intl/components/src/ICU4CGlue.h b/intl/components/src/ICU4CGlue.h index 012bb709ef83..2ccd4cb6b72f 100644 --- a/intl/components/src/ICU4CGlue.h +++ b/intl/components/src/ICU4CGlue.h @@ -159,11 +159,11 @@ class VectorToBufferAdaptor { }; /** - * An overload of FillBufferWithICUCall that accepts a mozilla::Vector rather - * than a Buffer. + * A variant of FillBufferWithICUCall that accepts a mozilla::Vector rather than + * a Buffer. 
*/ template -static ICUResult FillBufferWithICUCall(Vector& vector, +static ICUResult FillVectorWithICUCall(Vector& vector, const ICUStringFunction& strFn) { VectorToBufferAdaptor buffer(vector); return FillBufferWithICUCall(buffer, strFn); diff --git a/intl/components/src/Locale.cpp b/intl/components/src/Locale.cpp deleted file mode 100644 index f5310c0b958d..000000000000 --- a/intl/components/src/Locale.cpp +++ /dev/null @@ -1,1469 +0,0 @@ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -#include "mozilla/intl/Locale.h" - -#include "mozilla/Assertions.h" -#include "mozilla/DebugOnly.h" -#include "mozilla/MathAlgorithms.h" -#include "mozilla/Span.h" -#include "mozilla/TextUtils.h" -#include "mozilla/Variant.h" - -#include "ICU4CGlue.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "unicode/uloc.h" -#include "unicode/utypes.h" - -namespace mozilla::intl { - -using namespace intl::LanguageTagLimits; - -template -bool IsStructurallyValidLanguageTag(Span language) { - // unicode_language_subtag = alpha{2,3} | alpha{5,8}; - size_t length = language.size(); - const CharT* str = language.data(); - return ((2 <= length && length <= 3) || (5 <= length && length <= 8)) && - std::all_of(str, str + length, IsAsciiAlpha); -} - -template bool IsStructurallyValidLanguageTag(Span language); -template bool IsStructurallyValidLanguageTag(Span language); -template bool IsStructurallyValidLanguageTag(Span language); - -template -bool IsStructurallyValidScriptTag(Span script) { - // unicode_script_subtag = alpha{4} ; - size_t length = script.size(); - const CharT* str = script.data(); - return length == 4 && std::all_of(str, str + length, IsAsciiAlpha); -} - -template bool IsStructurallyValidScriptTag(Span script); -template bool IsStructurallyValidScriptTag(Span script); 
-template bool IsStructurallyValidScriptTag(Span script); - -template -bool IsStructurallyValidRegionTag(Span region) { - // unicode_region_subtag = (alpha{2} | digit{3}) ; - size_t length = region.size(); - const CharT* str = region.data(); - return (length == 2 && std::all_of(str, str + length, IsAsciiAlpha)) || - (length == 3 && std::all_of(str, str + length, IsAsciiDigit)); -} - -template bool IsStructurallyValidRegionTag(Span region); -template bool IsStructurallyValidRegionTag(Span region); -template bool IsStructurallyValidRegionTag(Span region); - -#ifdef DEBUG -bool IsStructurallyValidVariantTag(Span variant) { - // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ; - size_t length = variant.size(); - const char* str = variant.data(); - return ((5 <= length && length <= 8) || - (length == 4 && IsAsciiDigit(str[0]))) && - std::all_of(str, str + length, IsAsciiAlphanumeric); -} - -bool IsStructurallyValidUnicodeExtensionTag(Span extension) { - return LocaleParser::canParseUnicodeExtension(extension).isOk(); -} - -static bool IsStructurallyValidExtensionTag(Span extension) { - // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; - // NB: Allow any extension, including Unicode and Transform here, because - // this function is only used for an assertion. - - size_t length = extension.size(); - const char* str = extension.data(); - const char* const end = extension.data() + length; - if (length <= 2) { - return false; - } - if (!IsAsciiAlphanumeric(str[0]) || str[0] == 'x' || str[0] == 'X') { - return false; - } - str++; - if (*str++ != '-') { - return false; - } - while (true) { - const char* sep = - reinterpret_cast(memchr(str, '-', end - str)); - size_t len = (sep ? 
sep : end) - str; - if (len < 2 || len > 8 || - !std::all_of(str, str + len, IsAsciiAlphanumeric)) { - return false; - } - if (!sep) { - return true; - } - str = sep + 1; - } -} - -bool IsStructurallyValidPrivateUseTag(Span privateUse) { - // pu_extensions = sep [xX] (sep alphanum{1,8})+ ; - - size_t length = privateUse.size(); - const char* str = privateUse.data(); - const char* const end = privateUse.data() + length; - if (length <= 2) { - return false; - } - if (str[0] != 'x' && str[0] != 'X') { - return false; - } - str++; - if (*str++ != '-') { - return false; - } - while (true) { - const char* sep = - reinterpret_cast(memchr(str, '-', end - str)); - size_t len = (sep ? sep : end) - str; - if (len == 0 || len > 8 || - !std::all_of(str, str + len, IsAsciiAlphanumeric)) { - return false; - } - if (!sep) { - return true; - } - str = sep + 1; - } -} -#endif - -ptrdiff_t Locale::unicodeExtensionIndex() const { - // The extension subtags aren't necessarily sorted, so we can't use binary - // search here. - auto p = std::find_if( - extensions().begin(), extensions().end(), - [](const auto& ext) { return ext[0] == 'u' || ext[0] == 'U'; }); - if (p != extensions().end()) { - return std::distance(extensions().begin(), p); - } - return -1; -} - -const char* Locale::unicodeExtension() const { - ptrdiff_t index = unicodeExtensionIndex(); - if (index >= 0) { - return extensions()[index].get(); - } - return nullptr; -} - -bool Locale::setUnicodeExtension(const char* extension) { - MOZ_ASSERT(IsStructurallyValidUnicodeExtensionTag(MakeStringSpan(extension))); - - auto duplicated = DuplicateStringToUniqueChars(extension); - - // Replace the existing Unicode extension subtag or append a new one. 
- ptrdiff_t index = unicodeExtensionIndex(); - if (index >= 0) { - extensions_[index] = std::move(duplicated); - return true; - } - return extensions_.append(std::move(duplicated)); -} - -void Locale::clearUnicodeExtension() { - ptrdiff_t index = unicodeExtensionIndex(); - if (index >= 0) { - extensions_.erase(extensions_.begin() + index); - } -} - -template -static bool SortAlphabetically(Vector& subtags) { - size_t length = subtags.length(); - - // Zero or one element lists are already sorted. - if (length < 2) { - return true; - } - - // Handle two element lists inline. - if (length == 2) { - if (strcmp(subtags[0].get(), subtags[1].get()) > 0) { - subtags[0].swap(subtags[1]); - } - return true; - } - - Vector scratch; - if (!scratch.resizeUninitialized(length)) { - return false; - } - for (size_t i = 0; i < length; i++) { - scratch[i] = subtags[i].release(); - } - - std::stable_sort( - scratch.begin(), scratch.end(), - [](const char* a, const char* b) { return strcmp(a, b) < 0; }); - - for (size_t i = 0; i < length; i++) { - subtags[i] = UniqueChars(scratch[i]); - } - return true; -} - -Result Locale::canonicalizeBaseName() { - // Per 6.2.3 CanonicalizeUnicodeLocaleId, the very first step is to - // canonicalize the syntax by normalizing the case and ordering all subtags. - // The canonical syntax form is specified in UTS 35, 3.2.1. - - // Language codes need to be in lower case. "JA" -> "ja" - language_.toLowerCase(); - MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span())); - - // The first character of a script code needs to be capitalized. - // "hans" -> "Hans" - script_.toTitleCase(); - MOZ_ASSERT(script().missing() || - IsStructurallyValidScriptTag(script().span())); - - // Region codes need to be in upper case. "bu" -> "BU" - region_.toUpperCase(); - MOZ_ASSERT(region().missing() || - IsStructurallyValidRegionTag(region().span())); - - // The canonical case for variant subtags is lowercase. 
- for (UniqueChars& variant : variants_) { - char* variantChars = variant.get(); - size_t variantLength = strlen(variantChars); - AsciiToLowerCase(variantChars, variantLength, variantChars); - - MOZ_ASSERT(IsStructurallyValidVariantTag({variantChars, variantLength})); - } - - // Extensions and privateuse subtags are case normalized in the - // |canonicalizeExtensions| method. - - // The second step in UTS 35, 3.2.1, is to order all subtags. - - if (variants_.length() > 1) { - // 1. Any variants are in alphabetical order. - if (!SortAlphabetically(variants_)) { - return Err(CanonicalizationError::OutOfMemory); - } - - // Reject the Locale identifier if a duplicate variant was found, e.g. - // "en-variant-Variant". - const UniqueChars* duplicate = std::adjacent_find( - variants().begin(), variants().end(), [](const auto& a, const auto& b) { - return strcmp(a.get(), b.get()) == 0; - }); - if (duplicate != variants().end()) { - return Err(CanonicalizationError::DuplicateVariant); - } - } - - // 2. Any extensions are in alphabetical order by their singleton. - // 3. All attributes are sorted in alphabetical order. - // 4. All keywords and tfields are sorted by alphabetical order of their keys, - // within their respective extensions. - // 5. Any type or tfield value "true" is removed. - // - A subsequent call to canonicalizeExtensions() will perform these steps. - - // 6.2.3 CanonicalizeUnicodeLocaleId, step 2 transforms the locale identifier - // into its canonical form per UTS 3.2.1. - - // 1. Use the bcp47 data to replace keys, types, tfields, and tvalues by their - // canonical forms. - // - A subsequent call to canonicalizeExtensions() will perform this step. - - // 2. Replace aliases in the unicode_language_id and tlang (if any). - // - tlang is handled in canonicalizeExtensions(). - - // Replace deprecated language, region, and variant subtags with their - // preferred mappings. 
- - if (!updateLegacyMappings()) { - return Err(CanonicalizationError::OutOfMemory); - } - - // Replace deprecated language subtags with their preferred values. - if (!languageMapping(language_) && complexLanguageMapping(language_)) { - performComplexLanguageMappings(); - } - - // Replace deprecated script subtags with their preferred values. - if (script().present()) { - scriptMapping(script_); - } - - // Replace deprecated region subtags with their preferred values. - if (region().present()) { - if (!regionMapping(region_) && complexRegionMapping(region_)) { - performComplexRegionMappings(); - } - } - - // Replace deprecated variant subtags with their preferred values. - if (!performVariantMappings()) { - return Err(CanonicalizationError::OutOfMemory); - } - - // No extension replacements are currently present. - // Private use sequences are left as is. - - // 3. Replace aliases in special key values. - // - A subsequent call to canonicalizeExtensions() will perform this step. - - return Ok(); -} - -#ifdef DEBUG -static bool IsAsciiLowercaseAlphanumericOrDash(Span span) { - const char* ptr = span.data(); - size_t length = span.size(); - return std::all_of(ptr, ptr + length, [](auto c) { - return IsAsciiLowercaseAlpha(c) || IsAsciiDigit(c) || c == '-'; - }); -} -#endif - -Result Locale::canonicalizeExtensions() { - // The canonical case for all extension subtags is lowercase. - for (UniqueChars& extension : extensions_) { - char* extensionChars = extension.get(); - size_t extensionLength = strlen(extensionChars); - AsciiToLowerCase(extensionChars, extensionLength, extensionChars); - - MOZ_ASSERT( - IsStructurallyValidExtensionTag({extensionChars, extensionLength})); - } - - // Any extensions are in alphabetical order by their singleton. 
- // "u-ca-chinese-t-zh-latn" -> "t-zh-latn-u-ca-chinese" - if (!SortAlphabetically(extensions_)) { - return Err(CanonicalizationError::OutOfMemory); - } - - for (UniqueChars& extension : extensions_) { - if (extension[0] == 'u') { - MOZ_TRY(canonicalizeUnicodeExtension(extension)); - } else if (extension[0] == 't') { - MOZ_TRY(canonicalizeTransformExtension(extension)); - } - - MOZ_ASSERT( - IsAsciiLowercaseAlphanumericOrDash(MakeStringSpan(extension.get()))); - } - - // The canonical case for privateuse subtags is lowercase. - if (char* privateuse = privateuse_.get()) { - size_t privateuseLength = strlen(privateuse); - AsciiToLowerCase(privateuse, privateuseLength, privateuse); - - MOZ_ASSERT( - IsStructurallyValidPrivateUseTag({privateuse, privateuseLength})); - } - return Ok(); -} - -/** - * CanonicalizeUnicodeExtension( attributes, keywords ) - * - * Canonical syntax per - * : - * - * - All attributes and keywords are in lowercase. - * - Note: The parser already converted keywords to lowercase. - * - All attributes are sorted in alphabetical order. - * - All keywords are sorted by alphabetical order of their keys. - * - Any type value "true" is removed. - * - * Canonical form: - * - All keys and types use the canonical form (from the name attribute; - * see Section 3.6.4 U Extension Data Files). 
- */ -Result Locale::canonicalizeUnicodeExtension( - UniqueChars& unicodeExtension) { - const char* const extension = unicodeExtension.get(); - MOZ_ASSERT(extension[0] == 'u'); - MOZ_ASSERT(extension[1] == '-'); - MOZ_ASSERT(IsStructurallyValidExtensionTag(MakeStringSpan(extension))); - - size_t length = strlen(extension); - - LocaleParser::AttributesVector attributes; - LocaleParser::KeywordsVector keywords; - - using Attribute = LocaleParser::AttributesVector::ElementType; - using Keyword = LocaleParser::KeywordsVector::ElementType; - - if (LocaleParser::parseUnicodeExtension(Span(extension, length), attributes, - keywords) - .isErr()) { - MOZ_ASSERT_UNREACHABLE("unexpected invalid Unicode extension subtag"); - return Err(CanonicalizationError::InternalError); - } - - auto attributesLess = [extension](const Attribute& a, const Attribute& b) { - const char* astr = a.begin(extension); - const char* bstr = b.begin(extension); - size_t alen = a.length(); - size_t blen = b.length(); - - if (int r = - std::char_traits::compare(astr, bstr, std::min(alen, blen))) { - return r < 0; - } - return alen < blen; - }; - - // All attributes are sorted in alphabetical order. - if (attributes.length() > 1) { - std::stable_sort(attributes.begin(), attributes.end(), attributesLess); - } - - auto keywordsLess = [extension](const Keyword& a, const Keyword& b) { - const char* astr = a.begin(extension); - const char* bstr = b.begin(extension); - MOZ_ASSERT(a.length() >= UnicodeKeyLength); - MOZ_ASSERT(b.length() >= UnicodeKeyLength); - - return std::char_traits::compare(astr, bstr, UnicodeKeyLength) < 0; - }; - - // All keywords are sorted by alphabetical order of keys. - if (keywords.length() > 1) { - // Using a stable sort algorithm, guarantees that two keywords using the - // same key are never reordered. That means for example - // when we have the input "u-nu-thai-kf-false-nu-latn", we are guaranteed to - // get the result "u-kf-false-nu-thai-nu-latn", i.e. 
"nu-thai" still occurs - // before "nu-latn". - // This is required so that deduplication below preserves the first keyword - // for a given key and discards the rest. - std::stable_sort(keywords.begin(), keywords.end(), keywordsLess); - } - - Vector sb; - if (!sb.append('u')) { - return Err(CanonicalizationError::OutOfMemory); - } - - // Append all Unicode extension attributes. - for (size_t i = 0; i < attributes.length(); i++) { - const auto& attribute = attributes[i]; - - // Skip duplicate attributes. - if (i > 0) { - const auto& lastAttribute = attributes[i - 1]; - if (attribute.length() == lastAttribute.length() && - std::char_traits::compare(attribute.begin(extension), - lastAttribute.begin(extension), - attribute.length()) == 0) { - continue; - } - MOZ_ASSERT(attributesLess(lastAttribute, attribute)); - } - - if (!sb.append('-')) { - return Err(CanonicalizationError::OutOfMemory); - } - if (!sb.append(attribute.begin(extension), attribute.length())) { - return Err(CanonicalizationError::OutOfMemory); - } - } - - static constexpr size_t UnicodeKeyWithSepLength = UnicodeKeyLength + 1; - - using StringSpan = Span; - - static auto isTrue = [](StringSpan type) { - static constexpr char True[] = "true"; - constexpr size_t TrueLength = std::char_traits::length(True); - return type.size() == TrueLength && - std::char_traits::compare(type.data(), True, TrueLength) == 0; - }; - - auto appendKey = [&sb, extension](const Keyword& keyword) { - MOZ_ASSERT(keyword.length() == UnicodeKeyLength); - return sb.append(keyword.begin(extension), UnicodeKeyLength); - }; - - auto appendKeyword = [&sb, extension](const Keyword& keyword, - StringSpan type) { - MOZ_ASSERT(keyword.length() > UnicodeKeyLength); - - // Elide the Unicode extension type "true". - if (isTrue(type)) { - return sb.append(keyword.begin(extension), UnicodeKeyLength); - } - // Otherwise append the complete Unicode extension keyword. 
- return sb.append(keyword.begin(extension), keyword.length()); - }; - - auto appendReplacement = [&sb, extension](const Keyword& keyword, - StringSpan replacement) { - MOZ_ASSERT(keyword.length() > UnicodeKeyLength); - - // Elide the type "true" if present in the replacement. - if (isTrue(replacement)) { - return sb.append(keyword.begin(extension), UnicodeKeyLength); - } - // Otherwise append the Unicode key (including the separator) and the - // replaced type. - return sb.append(keyword.begin(extension), UnicodeKeyWithSepLength) && - sb.append(replacement.data(), replacement.size()); - }; - - // Append all Unicode extension keywords. - for (size_t i = 0; i < keywords.length(); i++) { - const auto& keyword = keywords[i]; - - // Skip duplicate keywords. - if (i > 0) { - const auto& lastKeyword = keywords[i - 1]; - if (std::char_traits::compare(keyword.begin(extension), - lastKeyword.begin(extension), - UnicodeKeyLength) == 0) { - continue; - } - MOZ_ASSERT(keywordsLess(lastKeyword, keyword)); - } - - if (!sb.append('-')) { - return Err(CanonicalizationError::OutOfMemory); - } - - if (keyword.length() == UnicodeKeyLength) { - // Keyword without type value. - if (!appendKey(keyword)) { - return Err(CanonicalizationError::OutOfMemory); - } - } else { - StringSpan key(keyword.begin(extension), UnicodeKeyLength); - StringSpan type(keyword.begin(extension) + UnicodeKeyWithSepLength, - keyword.length() - UnicodeKeyWithSepLength); - - // Search if there's a replacement for the current Unicode keyword. - if (const char* replacement = replaceUnicodeExtensionType(key, type)) { - if (!appendReplacement(keyword, MakeStringSpan(replacement))) { - return Err(CanonicalizationError::OutOfMemory); - } - } else { - if (!appendKeyword(keyword, type)) { - return Err(CanonicalizationError::OutOfMemory); - } - } - } - } - - // We can keep the previous extension when canonicalization didn't modify it. 
- if (sb.length() != length || - std::char_traits::compare(sb.begin(), extension, length) != 0) { - // Null-terminate the new string and replace the previous extension. - if (!sb.append('\0')) { - return Err(CanonicalizationError::OutOfMemory); - } - UniqueChars canonical(sb.extractOrCopyRawBuffer()); - if (!canonical) { - return Err(CanonicalizationError::OutOfMemory); - } - unicodeExtension = std::move(canonical); - } - - return Ok(); -} - -template -static bool LocaleToString(const Locale& tag, Buffer& sb) { - auto appendSubtag = [&sb](const auto& subtag) { - auto span = subtag.span(); - MOZ_ASSERT(!span.empty()); - return sb.append(span.data(), span.size()); - }; - - auto appendSubtagZ = [&sb](const char* subtag) { - MOZ_ASSERT(strlen(subtag) > 0); - return sb.append(subtag, strlen(subtag)); - }; - - auto appendSubtagsZ = [&sb, &appendSubtagZ](const auto& subtags) { - for (const auto& subtag : subtags) { - if (!sb.append('-') || !appendSubtagZ(subtag.get())) { - return false; - } - } - return true; - }; - - // Append the language subtag. - if (!appendSubtag(tag.language())) { - return false; - } - - // Append the script subtag if present. - if (tag.script().present()) { - if (!sb.append('-') || !appendSubtag(tag.script())) { - return false; - } - } - - // Append the region subtag if present. - if (tag.region().present()) { - if (!sb.append('-') || !appendSubtag(tag.region())) { - return false; - } - } - - // Append the variant subtags if present. - if (!appendSubtagsZ(tag.variants())) { - return false; - } - - // Append the extensions subtags if present. - if (!appendSubtagsZ(tag.extensions())) { - return false; - } - - // Append the private-use subtag if present. 
- if (tag.privateuse()) { - if (!sb.append('-') || !appendSubtagZ(tag.privateuse())) { - return false; - } - } - - return true; -} - -/** - * CanonicalizeTransformExtension - * - * Canonical form per : - * - * - These subtags are all in lowercase (that is the canonical casing for these - * subtags), [...]. - * - * And per - * : - * - * - All keywords and tfields are sorted by alphabetical order of their keys, - * within their respective extensions. - */ -Result -Locale::canonicalizeTransformExtension(UniqueChars& transformExtension) { - const char* const extension = transformExtension.get(); - MOZ_ASSERT(extension[0] == 't'); - MOZ_ASSERT(extension[1] == '-'); - MOZ_ASSERT(IsStructurallyValidExtensionTag(MakeStringSpan(extension))); - - size_t length = strlen(extension); - - Locale tag; - LocaleParser::TFieldVector fields; - - using TField = LocaleParser::TFieldVector::ElementType; - - if (LocaleParser::parseTransformExtension(Span(extension, length), tag, - fields) - .isErr()) { - MOZ_ASSERT_UNREACHABLE("unexpected invalid transform extension subtag"); - return Err(CanonicalizationError::InternalError); - } - - auto tfieldLess = [extension](const TField& a, const TField& b) { - MOZ_ASSERT(a.length() > TransformKeyLength); - MOZ_ASSERT(b.length() > TransformKeyLength); - const char* astr = a.begin(extension); - const char* bstr = b.begin(extension); - return std::char_traits::compare(astr, bstr, TransformKeyLength) < 0; - }; - - // All tfields are sorted by alphabetical order of their keys. - if (fields.length() > 1) { - std::stable_sort(fields.begin(), fields.end(), tfieldLess); - } - - Vector sb; - if (!sb.append('t')) { - return Err(CanonicalizationError::OutOfMemory); - } - - // Append the language subtag if present. - // - // Replace aliases in tlang per - // . 
- if (tag.language().present()) { - if (!sb.append('-')) { - return Err(CanonicalizationError::OutOfMemory); - } - - MOZ_TRY(tag.canonicalizeBaseName()); - - // The canonical case for Transform extensions is lowercase per - // . Convert the two - // subtags which don't use lowercase for their canonical syntax. - tag.script_.toLowerCase(); - tag.region_.toLowerCase(); - - if (!LocaleToString(tag, sb)) { - return Err(CanonicalizationError::OutOfMemory); - } - } - - static constexpr size_t TransformKeyWithSepLength = TransformKeyLength + 1; - - using StringSpan = Span; - - // Append all fields. - // - // UTS 35, 3.2.1 specifies: - // - Any type or tfield value "true" is removed. - // - // But the `tvalue` subtag is mandatory in `tfield: tkey tvalue`, so ignore - // this apparently invalid part of the UTS 35 specification and simply - // append all `tfield` subtags. - for (const auto& field : fields) { - if (!sb.append('-')) { - return Err(CanonicalizationError::OutOfMemory); - } - - StringSpan key(field.begin(extension), TransformKeyLength); - StringSpan value(field.begin(extension) + TransformKeyWithSepLength, - field.length() - TransformKeyWithSepLength); - - // Search if there's a replacement for the current transform keyword. - if (const char* replacement = replaceTransformExtensionType(key, value)) { - if (!sb.append(field.begin(extension), TransformKeyWithSepLength)) { - return Err(CanonicalizationError::OutOfMemory); - } - if (!sb.append(replacement, strlen(replacement))) { - return Err(CanonicalizationError::OutOfMemory); - } - } else { - if (!sb.append(field.begin(extension), field.length())) { - return Err(CanonicalizationError::OutOfMemory); - } - } - } - - // We can keep the previous extension when canonicalization didn't modify it. - if (sb.length() != length || - std::char_traits::compare(sb.begin(), extension, length) != 0) { - // Null-terminate the new string and replace the previous extension. 
- if (!sb.append('\0')) { - return Err(CanonicalizationError::OutOfMemory); - } - UniqueChars canonical(sb.extractOrCopyRawBuffer()); - if (!canonical) { - return Err(CanonicalizationError::OutOfMemory); - } - transformExtension = std::move(canonical); - } - - return Ok(); -} - -// Zero-terminated ICU Locale ID. -using LocaleId = - Vector; - -enum class LikelySubtags : bool { Add, Remove }; - -// Return true iff the locale is already maximized resp. minimized. -static bool HasLikelySubtags(LikelySubtags likelySubtags, const Locale& tag) { - // The locale is already maximized if the language, script, and region - // subtags are present and no placeholder subtags ("und", "Zzzz", "ZZ") are - // used. - if (likelySubtags == LikelySubtags::Add) { - return !tag.language().equalTo("und") && - (tag.script().present() && !tag.script().equalTo("Zzzz")) && - (tag.region().present() && !tag.region().equalTo("ZZ")); - } - - // The locale is already minimized if it only contains a language - // subtag whose value is not the placeholder value "und". - return !tag.language().equalTo("und") && tag.script().missing() && - tag.region().missing(); -} - -// Create an ICU locale ID from the given locale. -static bool CreateLocaleForLikelySubtags(const Locale& tag, LocaleId& locale) { - MOZ_ASSERT(locale.length() == 0); - - auto appendSubtag = [&locale](const auto& subtag) { - auto span = subtag.span(); - MOZ_ASSERT(!span.empty()); - return locale.append(span.data(), span.size()); - }; - - // Append the language subtag. - if (!appendSubtag(tag.language())) { - return false; - } - - // Append the script subtag if present. - if (tag.script().present()) { - if (!locale.append('_') || !appendSubtag(tag.script())) { - return false; - } - } - - // Append the region subtag if present. - if (tag.region().present()) { - if (!locale.append('_') || !appendSubtag(tag.region())) { - return false; - } - } - - // Zero-terminated for use with ICU. 
- return locale.append('\0'); -} - -// Assign the language, script, and region subtags from an ICU locale ID. -// -// ICU provides |uloc_getLanguage|, |uloc_getScript|, and |uloc_getCountry| to -// retrieve these subtags, but unfortunately these functions are rather slow, so -// we use our own implementation. -static bool AssignFromLocaleId(LocaleId& localeId, Locale& tag) { - MOZ_ASSERT(localeId.back() == '\0', - "Locale ID should be zero-terminated for ICU"); - - // Replace the ICU locale ID separator. - std::replace(localeId.begin(), localeId.end(), '_', '-'); - - // ICU replaces "und" with the empty string, which means "und" becomes "" and - // "und-Latn" becomes "-Latn". Handle this case separately. - if (localeId[0] == '\0' || localeId[0] == '-') { - static constexpr char und[] = "und"; - constexpr size_t length = std::char_traits::length(und); - - // Insert "und" in front of the locale ID. - if (!localeId.growBy(length)) { - return false; - } - memmove(localeId.begin() + length, localeId.begin(), localeId.length()); - memmove(localeId.begin(), und, length); - } - - Span localeSpan(localeId.begin(), localeId.length() - 1); - - // Retrieve the language, script, and region subtags from the locale ID - Locale localeTag; - if (LocaleParser::tryParseBaseName(localeSpan, localeTag).isErr()) { - return false; - } - - tag.setLanguage(localeTag.language()); - tag.setScript(localeTag.script()); - tag.setRegion(localeTag.region()); - - return true; -} - -template -static bool CallLikelySubtags(const LocaleId& localeId, LocaleId& result) { - // Locale ID must be zero-terminated before passing it to ICU. - MOZ_ASSERT(localeId.back() == '\0'); - MOZ_ASSERT(result.length() == 0); - - // Ensure there's enough room for the result. 
- MOZ_ALWAYS_TRUE(result.resize(LocaleId::InlineLength)); - - if (FillBufferWithICUCall(result, [&localeId](char* chars, int32_t size, - UErrorCode* status) { - return likelySubtagsFn(localeId.begin(), chars, size, status); - }).isErr()) { - return false; - } - - // Zero-terminated for use with ICU. - return result.append('\0'); -} - -// The canonical way to compute the Unicode BCP 47 locale identifier with likely -// subtags is as follows: -// -// 1. Call uloc_forLanguageTag() to transform the locale identifer into an ICU -// locale ID. -// 2. Call uloc_addLikelySubtags() to add the likely subtags to the locale ID. -// 3. Call uloc_toLanguageTag() to transform the resulting locale ID back into -// a Unicode BCP 47 locale identifier. -// -// Since uloc_forLanguageTag() and uloc_toLanguageTag() are both kind of slow -// and we know, by construction, that the input Unicode BCP 47 locale identifier -// only contains valid language, script, and region subtags, we can avoid both -// calls if we implement them ourselves, see CreateLocaleForLikelySubtags() and -// AssignFromLocaleId(). (Where "slow" means about 50% of the execution time of -// |Intl.Locale.prototype.maximize|.) -static bool LikelySubtags(LikelySubtags likelySubtags, Locale& tag) { - // Return early if the input is already maximized/minimized. - if (HasLikelySubtags(likelySubtags, tag)) { - return true; - } - - // Create the locale ID for the input argument. - LocaleId locale; - if (!CreateLocaleForLikelySubtags(tag, locale)) { - return false; - } - - // Either add or remove likely subtags to/from the locale ID. - LocaleId localeLikelySubtags; - if (likelySubtags == LikelySubtags::Add) { - if (!CallLikelySubtags(locale, - localeLikelySubtags)) { - return false; - } - } else { - if (!CallLikelySubtags(locale, localeLikelySubtags)) { - return false; - } - } - - // Assign the language, script, and region subtags from the locale ID. 
- if (!AssignFromLocaleId(localeLikelySubtags, tag)) { - return false; - } - - // Update mappings in case ICU returned a non-canonical locale. - return tag.canonicalizeBaseName().isOk(); -} - -bool Locale::addLikelySubtags() { - return LikelySubtags(LikelySubtags::Add, *this); -} - -bool Locale::removeLikelySubtags() { - return LikelySubtags(LikelySubtags::Remove, *this); -} - -UniqueChars Locale::DuplicateStringToUniqueChars(const char* s) { - size_t length = strlen(s) + 1; - auto duplicate = MakeUnique(length); - memcpy(duplicate.get(), s, length); - return duplicate; -} - -size_t Locale::toStringCapacity() const { - // This is a bit awkward, the buffer class currently does not support - // being resized, so we need to calculate the required size up front and - // reserve it all at once. - auto lengthSubtag = [](const auto& subtag) { - auto span = subtag.span(); - MOZ_ASSERT(!span.empty()); - return span.size(); - }; - - auto lengthSubtagZ = [](const char* subtag) { - size_t length = strlen(subtag); - MOZ_ASSERT(length > 0); - return length; - }; - - auto lengthSubtagsZ = [&lengthSubtagZ](const auto& subtags) { - size_t length = 0; - for (const auto& subtag : subtags) { - length += lengthSubtagZ(subtag.get()) + 1; - } - return length; - }; - - // First calculate required capacity - size_t capacity = 0; - - capacity += lengthSubtag(language_); - - if (script_.present()) { - capacity += lengthSubtag(script_) + 1; - } - - if (region_.present()) { - capacity += lengthSubtag(region_) + 1; - } - - capacity += lengthSubtagsZ(variants_); - - capacity += lengthSubtagsZ(extensions_); - - if (privateuse_.get()) { - capacity += lengthSubtagZ(privateuse_.get()) + 1; - } - - return capacity; -} - -size_t Locale::toStringAppend(char* buffer) const { - // Current write position inside buffer. 
- size_t offset = 0; - - auto appendHyphen = [&offset, &buffer]() { - buffer[offset] = '-'; - offset += 1; - }; - - auto appendSubtag = [&offset, &buffer](const auto& subtag) { - auto span = subtag.span(); - memcpy(buffer + offset, span.data(), span.size()); - offset += span.size(); - }; - - auto appendSubtagZ = [&offset, &buffer](const char* subtag) { - size_t length = strlen(subtag); - memcpy(buffer + offset, subtag, length); - offset += length; - }; - - auto appendSubtagsZ = [&appendHyphen, &appendSubtagZ](const auto& subtags) { - for (const auto& subtag : subtags) { - appendHyphen(); - appendSubtagZ(subtag.get()); - } - }; - - // Append the language subtag. - appendSubtag(language_); - - // Append the script subtag if present. - if (script_.present()) { - appendHyphen(); - appendSubtag(script_); - } - - // Append the region subtag if present. - if (region_.present()) { - appendHyphen(); - appendSubtag(region_); - } - - // Append the variant subtags if present. - appendSubtagsZ(variants_); - - // Append the extensions subtags if present. - appendSubtagsZ(extensions_); - - // Append the private-use subtag if present. - if (privateuse_.get()) { - appendHyphen(); - appendSubtagZ(privateuse_.get()); - } - - return offset; -} - -LocaleParser::Token LocaleParser::nextToken() { - MOZ_ASSERT(index_ <= length_ + 1, "called after 'None' token was read"); - - TokenKind kind = TokenKind::None; - size_t tokenLength = 0; - for (size_t i = index_; i < length_; i++) { - // UTS 35, section 3.1. 
- // alpha = [A-Z a-z] ; - // digit = [0-9] ; - char c = charAt(i); - if (IsAsciiAlpha(c)) { - kind |= TokenKind::Alpha; - } else if (IsAsciiDigit(c)) { - kind |= TokenKind::Digit; - } else if (c == '-' && i > index_ && i + 1 < length_) { - break; - } else { - return {TokenKind::Error, 0, 0}; - } - tokenLength += 1; - } - - Token token{kind, index_, tokenLength}; - index_ += tokenLength + 1; - return token; -} - -UniqueChars LocaleParser::chars(size_t index, size_t length) const { - // Add +1 to null-terminate the string. - auto chars = MakeUnique(length + 1); - char* dest = chars.get(); - std::copy_n(locale_ + index, length, dest); - dest[length] = '\0'; - return chars; -} - -// Parse the `unicode_language_id` production. -// -// unicode_language_id = unicode_language_subtag -// (sep unicode_script_subtag)? -// (sep unicode_region_subtag)? -// (sep unicode_variant_subtag)* ; -// -// sep = "-" -// -// Note: Unicode CLDR locale identifier backward compatibility extensions -// removed from `unicode_language_id`. -// -// |tok| is the current token from |ts|. -// -// All subtags will be added unaltered to |tag|, without canonicalizing their -// case or, in the case of variant subtags, detecting and rejecting duplicate -// variants. Users must subsequently |canonicalizeBaseName| to perform these -// actions. -// -// Do not use this function directly: use |parseBaseName| or -// |parseTlangFromTransformExtension| instead. -Result LocaleParser::internalParseBaseName( - LocaleParser& ts, Locale& tag, Token& tok) { - if (ts.isLanguage(tok)) { - ts.copyChars(tok, tag.language_); - - tok = ts.nextToken(); - } else { - // The language subtag is mandatory. 
- return Err(ParserError::NotParseable); - } - - if (ts.isScript(tok)) { - ts.copyChars(tok, tag.script_); - - tok = ts.nextToken(); - } - - if (ts.isRegion(tok)) { - ts.copyChars(tok, tag.region_); - - tok = ts.nextToken(); - } - - auto& variants = tag.variants_; - MOZ_ASSERT(variants.length() == 0); - while (ts.isVariant(tok)) { - auto variant = ts.chars(tok); - if (!variants.append(std::move(variant))) { - return Err(ParserError::OutOfMemory); - } - - tok = ts.nextToken(); - } - - return Ok(); -} - -Result LocaleParser::tryParse( - mozilla::Span locale, Locale& tag) { - // unicode_locale_id = unicode_language_id - // extensions* - // pu_extensions? ; - - LocaleParser ts(locale); - Token tok = ts.nextToken(); - - MOZ_TRY(parseBaseName(ts, tag, tok)); - - // extensions = unicode_locale_extensions - // | transformed_extensions - // | other_extensions ; - - // Bit set of seen singletons. - uint64_t seenSingletons = 0; - - auto& extensions = tag.extensions_; - while (ts.isExtensionStart(tok)) { - char singleton = ts.singletonKey(tok); - - // Reject the input if a duplicate singleton was found. - uint64_t hash = 1ULL << (AsciiAlphanumericToNumber(singleton) + 1); - if (seenSingletons & hash) { - return Err(ParserError::NotParseable); - } - seenSingletons |= hash; - - Token start = tok; - tok = ts.nextToken(); - - // We'll check for missing non-singleton subtags after this block by - // comparing |startValue| with the then-current position. - size_t startValue = tok.index(); - - if (singleton == 'u') { - while (ts.isUnicodeExtensionPart(tok)) { - tok = ts.nextToken(); - } - } else if (singleton == 't') { - // transformed_extensions = sep [tT] - // ((sep tlang (sep tfield)*) - // | (sep tfield)+) ; - - // tlang = unicode_language_subtag - // (sep unicode_script_subtag)? - // (sep unicode_region_subtag)? 
- // (sep unicode_variant_subtag)* ; - if (ts.isLanguage(tok)) { - tok = ts.nextToken(); - - if (ts.isScript(tok)) { - tok = ts.nextToken(); - } - - if (ts.isRegion(tok)) { - tok = ts.nextToken(); - } - - while (ts.isVariant(tok)) { - tok = ts.nextToken(); - } - } - - // tfield = tkey tvalue; - while (ts.isTransformExtensionKey(tok)) { - tok = ts.nextToken(); - - size_t startTValue = tok.index(); - while (ts.isTransformExtensionPart(tok)) { - tok = ts.nextToken(); - } - - // `tfield` requires at least one `tvalue`. - if (tok.index() <= startTValue) { - return Err(ParserError::NotParseable); - } - } - } else { - while (ts.isOtherExtensionPart(tok)) { - tok = ts.nextToken(); - } - } - - // Singletons must be followed by a non-singleton subtag, "en-a-b" is not - // allowed. - if (tok.index() <= startValue) { - return Err(ParserError::NotParseable); - } - - UniqueChars extension = ts.extension(start, tok); - if (!extensions.append(std::move(extension))) { - return Err(ParserError::OutOfMemory); - } - } - - // Trailing `pu_extension` component of the `unicode_locale_id` production. - if (ts.isPrivateUseStart(tok)) { - Token start = tok; - tok = ts.nextToken(); - - size_t startValue = tok.index(); - while (ts.isPrivateUsePart(tok)) { - tok = ts.nextToken(); - } - - // There must be at least one subtag after the "-x-". 
- if (tok.index() <= startValue) { - return Err(ParserError::NotParseable); - } - - UniqueChars privateUse = ts.extension(start, tok); - tag.privateuse_ = std::move(privateUse); - } - - if (!tok.isNone()) { - return Err(ParserError::NotParseable); - } - - return Ok(); -} - -Result LocaleParser::tryParseBaseName( - Span locale, Locale& tag) { - LocaleParser ts(locale); - Token tok = ts.nextToken(); - - MOZ_TRY(parseBaseName(ts, tag, tok)); - if (!tok.isNone()) { - return Err(ParserError::NotParseable); - } - - return Ok(); -} - -// Parse |extension|, which must be a valid `transformed_extensions` subtag, and -// fill |tag| and |fields| from the `tlang` and `tfield` components. -Result LocaleParser::parseTransformExtension( - Span extension, Locale& tag, TFieldVector& fields) { - LocaleParser ts(extension); - Token tok = ts.nextToken(); - - if (!ts.isExtensionStart(tok) || ts.singletonKey(tok) != 't') { - return Err(ParserError::NotParseable); - } - - tok = ts.nextToken(); - - if (tok.isNone()) { - return Err(ParserError::NotParseable); - } - - if (ts.isLanguage(tok)) { - // We're parsing a possible `tlang` in a known-valid transform extension, so - // use the special-purpose function that takes advantage of this to compute - // lowercased |tag| contents in an optimal manner. - MOZ_TRY(parseTlangInTransformExtension(ts, tag, tok)); - - // After `tlang` we must have a `tfield` and its `tkey`, or we're at the end - // of the transform extension. - MOZ_ASSERT(ts.isTransformExtensionKey(tok) || tok.isNone()); - } else { - // If there's no `tlang` subtag, at least one `tfield` must be present. - MOZ_ASSERT(ts.isTransformExtensionKey(tok)); - } - - // Trailing `tfield` subtags. (Any other trailing subtags are an error, - // because we're guaranteed to only see a valid tranform extension here.) 
- while (ts.isTransformExtensionKey(tok)) { - size_t begin = tok.index(); - tok = ts.nextToken(); - - size_t startTValue = tok.index(); - while (ts.isTransformExtensionPart(tok)) { - tok = ts.nextToken(); - } - - // `tfield` requires at least one `tvalue`. - if (tok.index() <= startTValue) { - return Err(ParserError::NotParseable); - } - - size_t length = tok.index() - 1 - begin; - if (!fields.emplaceBack(begin, length)) { - return Err(ParserError::OutOfMemory); - } - } - - if (!tok.isNone()) { - return Err(ParserError::NotParseable); - } - - return Ok(); -} - -// Parse |extension|, which must be a valid `unicode_locale_extensions` subtag, -// and fill |attributes| and |keywords| from the `attribute` and `keyword` -// components. -Result LocaleParser::parseUnicodeExtension( - Span extension, AttributesVector& attributes, - KeywordsVector& keywords) { - LocaleParser ts(extension); - Token tok = ts.nextToken(); - - // unicode_locale_extensions = sep [uU] ((sep keyword)+ | - // (sep attribute)+ (sep keyword)*) ; - - if (!ts.isExtensionStart(tok) || ts.singletonKey(tok) != 'u') { - return Err(ParserError::NotParseable); - } - - tok = ts.nextToken(); - - if (tok.isNone()) { - return Err(ParserError::NotParseable); - } - - while (ts.isUnicodeExtensionAttribute(tok)) { - if (!attributes.emplaceBack(tok.index(), tok.length())) { - return Err(ParserError::OutOfMemory); - } - - tok = ts.nextToken(); - } - - // keyword = key (sep type)? 
; - while (ts.isUnicodeExtensionKey(tok)) { - size_t begin = tok.index(); - tok = ts.nextToken(); - - while (ts.isUnicodeExtensionType(tok)) { - tok = ts.nextToken(); - } - - if (tok.isError()) { - return Err(ParserError::NotParseable); - } - - size_t length = tok.index() - 1 - begin; - if (!keywords.emplaceBack(begin, length)) { - return Err(ParserError::OutOfMemory); - } - } - - if (!tok.isNone()) { - return Err(ParserError::NotParseable); - } - - return Ok(); -} - -Result LocaleParser::canParseUnicodeExtension( - Span extension) { - LocaleParser ts(extension); - Token tok = ts.nextToken(); - - // unicode_locale_extensions = sep [uU] ((sep keyword)+ | - // (sep attribute)+ (sep keyword)*) ; - - if (!ts.isExtensionStart(tok) || ts.singletonKey(tok) != 'u') { - return Err(ParserError::NotParseable); - } - - tok = ts.nextToken(); - - if (tok.isNone()) { - return Err(ParserError::NotParseable); - } - - while (ts.isUnicodeExtensionAttribute(tok)) { - tok = ts.nextToken(); - } - - // keyword = key (sep type)? ; - while (ts.isUnicodeExtensionKey(tok)) { - tok = ts.nextToken(); - - while (ts.isUnicodeExtensionType(tok)) { - tok = ts.nextToken(); - } - - if (tok.isError()) { - return Err(ParserError::NotParseable); - } - } - - if (!tok.isNone()) { - return Err(ParserError::OutOfMemory); - } - - return Ok(); -} - -Result -LocaleParser::canParseUnicodeExtensionType(Span unicodeType) { - MOZ_ASSERT(!unicodeType.empty(), "caller must exclude empty strings"); - - LocaleParser ts(unicodeType); - Token tok = ts.nextToken(); - - while (ts.isUnicodeExtensionType(tok)) { - tok = ts.nextToken(); - } - - if (!tok.isNone()) { - return Err(ParserError::NotParseable); - } - - return Ok(); -} - -} // namespace mozilla::intl diff --git a/intl/components/src/Locale.h b/intl/components/src/Locale.h index e57158630c63..a72fd7aa0e09 100644 --- a/intl/components/src/Locale.h +++ b/intl/components/src/Locale.h @@ -2,407 +2,22 @@ * License, v. 2.0. 
If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -/* Structured representation of Unicode locale IDs used with Intl functions. */ - -#ifndef intl_components_Locale_h -#define intl_components_Locale_h - -#include "mozilla/Assertions.h" -#include "mozilla/intl/ICUError.h" -#include "mozilla/intl/ICU4CGlue.h" -#include "mozilla/Span.h" -#include "mozilla/TextUtils.h" -#include "mozilla/TypedEnumBits.h" -#include "mozilla/Variant.h" -#include "mozilla/Vector.h" -#include "mozilla/Result.h" - -#include -#include -#include -#include -#include +#ifndef intl_components_Locale_h_ +#define intl_components_Locale_h_ #include "unicode/uloc.h" +#include "mozilla/intl/ICU4CGlue.h" + namespace mozilla::intl { -/** - * Return true if |language| is a valid language subtag. - */ -template -bool IsStructurallyValidLanguageTag(mozilla::Span language); - -/** - * Return true if |script| is a valid script subtag. - */ -template -bool IsStructurallyValidScriptTag(mozilla::Span script); - -/** - * Return true if |region| is a valid region subtag. - */ -template -bool IsStructurallyValidRegionTag(mozilla::Span region); - -#ifdef DEBUG -/** - * Return true if |variant| is a valid variant subtag. - */ -bool IsStructurallyValidVariantTag(mozilla::Span variant); - -/** - * Return true if |extension| is a valid Unicode extension subtag. - */ -bool IsStructurallyValidUnicodeExtensionTag( - mozilla::Span extension); - -/** - * Return true if |privateUse| is a valid private-use subtag. - */ -bool IsStructurallyValidPrivateUseTag(mozilla::Span privateUse); - -#endif - -template -char AsciiToLowerCase(CharT c) { - MOZ_ASSERT(mozilla::IsAscii(c)); - return mozilla::IsAsciiUppercaseAlpha(c) ? (c + 0x20) : c; -} - -template -char AsciiToUpperCase(CharT c) { - MOZ_ASSERT(mozilla::IsAscii(c)); - return mozilla::IsAsciiLowercaseAlpha(c) ? 
(c - 0x20) : c; -} - -template -void AsciiToLowerCase(CharT* chars, size_t length, char* dest) { - char (&fn)(CharT) = AsciiToLowerCase; - std::transform(chars, chars + length, dest, fn); -} - -template -void AsciiToUpperCase(CharT* chars, size_t length, char* dest) { - char (&fn)(CharT) = AsciiToUpperCase; - std::transform(chars, chars + length, dest, fn); -} - -template -void AsciiToTitleCase(CharT* chars, size_t length, char* dest) { - if (length > 0) { - AsciiToUpperCase(chars, 1, dest); - AsciiToLowerCase(chars + 1, length - 1, dest + 1); - } -} - -// Constants for language subtag lengths. -namespace LanguageTagLimits { - -// unicode_language_subtag = alpha{2,3} | alpha{5,8} ; -static constexpr size_t LanguageLength = 8; - -// unicode_script_subtag = alpha{4} ; -static constexpr size_t ScriptLength = 4; - -// unicode_region_subtag = (alpha{2} | digit{3}) ; -static constexpr size_t RegionLength = 3; -static constexpr size_t AlphaRegionLength = 2; -static constexpr size_t DigitRegionLength = 3; - -// key = alphanum alpha ; -static constexpr size_t UnicodeKeyLength = 2; - -// tkey = alpha digit ; -static constexpr size_t TransformKeyLength = 2; - -} // namespace LanguageTagLimits - -// Fixed size language subtag which is stored inline in Locale. 
-template -class LanguageTagSubtag final { - uint8_t length_ = 0; - char chars_[Length] = {}; // zero initialize - +class Locale final { public: - LanguageTagSubtag() = default; - - LanguageTagSubtag(const LanguageTagSubtag&) = delete; - LanguageTagSubtag& operator=(const LanguageTagSubtag&) = delete; - - size_t length() const { return length_; } - bool missing() const { return length_ == 0; } - bool present() const { return length_ > 0; } - - mozilla::Span span() const { return {chars_, length_}; } - - template - void set(mozilla::Span str) { - MOZ_ASSERT(str.size() <= Length); - std::copy_n(str.data(), str.size(), chars_); - length_ = str.size(); - } - - // The toXYZCase() methods are using |Length| instead of |length()|, because - // current compilers (tested GCC and Clang) can't infer the maximum string - // length - even when using hints like |std::min| - and instead are emitting - // SIMD optimized code. Using a fixed sized length avoids emitting the SIMD - // code. (Emitting SIMD code doesn't make sense here, because the SIMD code - // only kicks in for long strings.) A fixed length will additionally ensure - // the compiler unrolls the loop in the case conversion code. - - void toLowerCase() { AsciiToLowerCase(chars_, Length, chars_); } - - void toUpperCase() { AsciiToUpperCase(chars_, Length, chars_); } - - void toTitleCase() { AsciiToTitleCase(chars_, Length, chars_); } - - template - bool equalTo(const char (&str)[N]) const { - static_assert(N - 1 <= Length, - "subtag literals must not exceed the maximum subtag length"); - - return length_ == N - 1 && memcmp(chars_, str, N - 1) == 0; - } -}; - -using LanguageSubtag = LanguageTagSubtag; -using ScriptSubtag = LanguageTagSubtag; -using RegionSubtag = LanguageTagSubtag; - -using Latin1Char = unsigned char; -using UniqueChars = UniquePtr; - -/** - * Object representing a Unicode BCP 47 locale identifier. - * - * All subtags are already in canonicalized case. 
- */ -class MOZ_STACK_CLASS Locale final { - LanguageSubtag language_ = {}; - ScriptSubtag script_ = {}; - RegionSubtag region_ = {}; - - using VariantsVector = Vector; - using ExtensionsVector = Vector; - - VariantsVector variants_; - ExtensionsVector extensions_; - UniqueChars privateuse_ = nullptr; - - friend class LocaleParser; - - public: - enum class CanonicalizationError : uint8_t { - DuplicateVariant, - InternalError, - OutOfMemory, - }; - - private: - Result canonicalizeUnicodeExtension( - UniqueChars& unicodeExtension); - - Result canonicalizeTransformExtension( - UniqueChars& transformExtension); - - public: - static bool languageMapping(LanguageSubtag& language); - static bool complexLanguageMapping(const LanguageSubtag& language); - - private: - static bool scriptMapping(ScriptSubtag& script); - static bool regionMapping(RegionSubtag& region); - static bool complexRegionMapping(const RegionSubtag& region); - - void performComplexLanguageMappings(); - void performComplexRegionMappings(); - [[nodiscard]] bool performVariantMappings(); - - [[nodiscard]] bool updateLegacyMappings(); - - static bool signLanguageMapping(LanguageSubtag& language, - const RegionSubtag& region); - - static const char* replaceTransformExtensionType( - mozilla::Span key, mozilla::Span type); - - public: - /** - * Given a Unicode key and type, return the null-terminated preferred - * replacement for that type if there is one, or null if there is none, e.g. - * in effect - * |replaceUnicodeExtensionType("ca", "islamicc") == "islamic-civil"| - * and - * |replaceUnicodeExtensionType("ca", "islamic-civil") == nullptr|. 
- */ - static const char* replaceUnicodeExtensionType( - mozilla::Span key, mozilla::Span type); - - public: - Locale() = default; - Locale(const Locale&) = delete; - Locale& operator=(const Locale&) = delete; - - const LanguageSubtag& language() const { return language_; } - const ScriptSubtag& script() const { return script_; } - const RegionSubtag& region() const { return region_; } - const auto& variants() const { return variants_; } - const auto& extensions() const { return extensions_; } - const char* privateuse() const { return privateuse_.get(); } - - /** - * Return the Unicode extension subtag or nullptr if not present. - */ - const char* unicodeExtension() const; - - private: - ptrdiff_t unicodeExtensionIndex() const; - - public: - /** - * Set the language subtag. The input must be a valid language subtag. - */ - template - void setLanguage(const char (&language)[N]) { - mozilla::Span span(language, N - 1); - MOZ_ASSERT(IsStructurallyValidLanguageTag(span)); - language_.set(span); - } - - /** - * Set the language subtag. The input must be a valid language subtag. - */ - void setLanguage(const LanguageSubtag& language) { - MOZ_ASSERT(IsStructurallyValidLanguageTag(language.span())); - language_.set(language.span()); - } - - /** - * Set the script subtag. The input must be a valid script subtag. - */ - template - void setScript(const char (&script)[N]) { - mozilla::Span span(script, N - 1); - MOZ_ASSERT(IsStructurallyValidScriptTag(span)); - script_.set(span); - } - - /** - * Set the script subtag. The input must be a valid script subtag or the empty - * string. - */ - void setScript(const ScriptSubtag& script) { - MOZ_ASSERT(script.missing() || IsStructurallyValidScriptTag(script.span())); - script_.set(script.span()); - } - - /** - * Set the region subtag. The input must be a valid region subtag. 
- */ - template - void setRegion(const char (®ion)[N]) { - mozilla::Span span(region, N - 1); - MOZ_ASSERT(IsStructurallyValidRegionTag(span)); - region_.set(span); - } - - /** - * Set the region subtag. The input must be a valid region subtag or the empty - * empty string. - */ - void setRegion(const RegionSubtag& region) { - MOZ_ASSERT(region.missing() || IsStructurallyValidRegionTag(region.span())); - region_.set(region.span()); - } - - /** - * Removes all variant subtags. - */ - void clearVariants() { variants_.clearAndFree(); } - - /** - * Set the Unicode extension subtag. The input must be a valid Unicode - * extension subtag. - */ - [[nodiscard]] bool setUnicodeExtension(const char* extension); - - /** - * Remove any Unicode extension subtag if present. - */ - void clearUnicodeExtension(); - - /** Canonicalize the base-name (language, script, region, variant) subtags. */ - Result canonicalizeBaseName(); - - /** - * Canonicalize all extension subtags. - */ - Result canonicalizeExtensions(); - - /** - * Canonicalizes the given structurally valid Unicode BCP 47 locale - * identifier, including regularized case of subtags. For example, the - * locale Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE, - * where - * - * Zh ; 2*3ALPHA - * -haNS ; ["-" script] - * -bu ; ["-" region] - * -variant2 ; *("-" variant) - * -Variant1 - * -u-ca-chinese ; *("-" extension) - * -t-Zh-laTN - * -x-PRIVATE ; ["-" privateuse] - * - * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private - * - * Spec: ECMAScript Internationalization API Specification, 6.2.3. - */ - Result canonicalize() { - MOZ_TRY(canonicalizeBaseName()); - return canonicalizeExtensions(); - } - - /** - * Fill the buffer with a string representation of the locale. 
- */ - template - Result toString(B& buffer) const { - static_assert(std::is_same_v); - - size_t capacity = toStringCapacity(); - - // Attempt to reserve needed capacity - if (!buffer.reserve(capacity)) { - return Err(ICUError::OutOfMemory); - } - - size_t offset = toStringAppend(buffer.data()); - - MOZ_ASSERT(capacity == offset); - buffer.written(offset); - - return Ok(); - } - - /** - * Add likely-subtags to the locale. - * - * Spec: - */ - [[nodiscard]] bool addLikelySubtags(); - - /** - * Remove likely-subtags from the locale. - * - * Spec: - */ - [[nodiscard]] bool removeLikelySubtags(); + Locale() = delete; /** * Returns the default locale as an ICU locale identifier. The returned string - * is NOT a valid BCP 47 locale! + * is NOT a valid BCP 47 language tag! * * Also see . */ @@ -420,273 +35,8 @@ class MOZ_STACK_CLASS Locale final { return AvailableLocalesEnumeration(); } - - private: - static UniqueChars DuplicateStringToUniqueChars(const char* s); - size_t toStringCapacity() const; - size_t toStringAppend(char* buffer) const; }; -/** - * Parser for Unicode BCP 47 locale identifiers. - * - * - */ -class MOZ_STACK_CLASS LocaleParser final { - public: - enum class ParserError : uint8_t { - // Input was not parseable as a locale, subtag or extension. - NotParseable, - // Unable to allocate memory for the parser to operate. - OutOfMemory, - }; - - // Exposed as |public| for |MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS|. 
- enum class TokenKind : uint8_t { - None = 0b000, - Alpha = 0b001, - Digit = 0b010, - AlphaDigit = 0b011, - Error = 0b100 - }; - - private: - class Token final { - size_t index_; - size_t length_; - TokenKind kind_; - - public: - Token(TokenKind kind, size_t index, size_t length) - : index_(index), length_(length), kind_(kind) {} - - TokenKind kind() const { return kind_; } - size_t index() const { return index_; } - size_t length() const { return length_; } - - bool isError() const { return kind_ == TokenKind::Error; } - bool isNone() const { return kind_ == TokenKind::None; } - bool isAlpha() const { return kind_ == TokenKind::Alpha; } - bool isDigit() const { return kind_ == TokenKind::Digit; } - bool isAlphaDigit() const { return kind_ == TokenKind::AlphaDigit; } - }; - - const char* locale_; - size_t length_; - size_t index_ = 0; - - explicit LocaleParser(Span locale) - : locale_(locale.data()), length_(locale.size()) {} - - char charAt(size_t index) const { return locale_[index]; } - - // Copy the token characters into |subtag|. - template - void copyChars(const Token& tok, LanguageTagSubtag& subtag) const { - subtag.set(mozilla::Span(locale_ + tok.index(), tok.length())); - } - - // Create a string copy of |length| characters starting at |index|. - UniqueChars chars(size_t index, size_t length) const; - - // Create a string copy of the token characters. - UniqueChars chars(const Token& tok) const { - return chars(tok.index(), tok.length()); - } - - UniqueChars extension(const Token& start, const Token& end) const { - MOZ_ASSERT(start.index() < end.index()); - - size_t length = end.index() - 1 - start.index(); - return chars(start.index(), length); - } - - Token nextToken(); - - // unicode_language_subtag = alpha{2,3} | alpha{5,8} ; - // - // Four character language subtags are not allowed in Unicode BCP 47 locale - // identifiers. Also see the comparison to Unicode CLDR locale identifiers in - // . 
- bool isLanguage(const Token& tok) const { - return tok.isAlpha() && ((2 <= tok.length() && tok.length() <= 3) || - (5 <= tok.length() && tok.length() <= 8)); - } - - // unicode_script_subtag = alpha{4} ; - bool isScript(const Token& tok) const { - return tok.isAlpha() && tok.length() == 4; - } - - // unicode_region_subtag = (alpha{2} | digit{3}) ; - bool isRegion(const Token& tok) const { - return (tok.isAlpha() && tok.length() == 2) || - (tok.isDigit() && tok.length() == 3); - } - - // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ; - bool isVariant(const Token& tok) const { - return (5 <= tok.length() && tok.length() <= 8) || - (tok.length() == 4 && mozilla::IsAsciiDigit(charAt(tok.index()))); - } - - // Returns the code unit of the first character at the given singleton token. - // Always returns the lower case form of an alphabetical character. - char singletonKey(const Token& tok) const { - MOZ_ASSERT(tok.length() == 1); - return AsciiToLowerCase(charAt(tok.index())); - } - - // extensions = unicode_locale_extensions | - // transformed_extensions | - // other_extensions ; - // - // unicode_locale_extensions = sep [uU] ((sep keyword)+ | - // (sep attribute)+ (sep keyword)*) ; - // - // transformed_extensions = sep [tT] ((sep tlang (sep tfield)*) | - // (sep tfield)+) ; - // - // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; - bool isExtensionStart(const Token& tok) const { - return tok.length() == 1 && singletonKey(tok) != 'x'; - } - - // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; - bool isOtherExtensionPart(const Token& tok) const { - return 2 <= tok.length() && tok.length() <= 8; - } - - // unicode_locale_extensions = sep [uU] ((sep keyword)+ | - // (sep attribute)+ (sep keyword)*) ; - // keyword = key (sep type)? 
; - bool isUnicodeExtensionPart(const Token& tok) const { - return isUnicodeExtensionKey(tok) || isUnicodeExtensionType(tok) || - isUnicodeExtensionAttribute(tok); - } - - // attribute = alphanum{3,8} ; - bool isUnicodeExtensionAttribute(const Token& tok) const { - return 3 <= tok.length() && tok.length() <= 8; - } - - // key = alphanum alpha ; - bool isUnicodeExtensionKey(const Token& tok) const { - return tok.length() == 2 && mozilla::IsAsciiAlpha(charAt(tok.index() + 1)); - } - - // type = alphanum{3,8} (sep alphanum{3,8})* ; - bool isUnicodeExtensionType(const Token& tok) const { - return 3 <= tok.length() && tok.length() <= 8; - } - - // tkey = alpha digit ; - bool isTransformExtensionKey(const Token& tok) const { - return tok.length() == 2 && mozilla::IsAsciiAlpha(charAt(tok.index())) && - mozilla::IsAsciiDigit(charAt(tok.index() + 1)); - } - - // tvalue = (sep alphanum{3,8})+ ; - bool isTransformExtensionPart(const Token& tok) const { - return 3 <= tok.length() && tok.length() <= 8; - } - - // pu_extensions = sep [xX] (sep alphanum{1,8})+ ; - bool isPrivateUseStart(const Token& tok) const { - return tok.length() == 1 && singletonKey(tok) == 'x'; - } - - // pu_extensions = sep [xX] (sep alphanum{1,8})+ ; - bool isPrivateUsePart(const Token& tok) const { - return 1 <= tok.length() && tok.length() <= 8; - } - - // Helper function for use in |parseBaseName| and - // |parseTlangInTransformExtension|. Do not use this directly! - static Result internalParseBaseName(LocaleParser& ts, - Locale& tag, Token& tok); - - // Parse the `unicode_language_id` production, i.e. the - // language/script/region/variants portion of a locale, into |tag|. - // |tok| must be the current token. - static Result parseBaseName(LocaleParser& ts, Locale& tag, - Token& tok) { - return internalParseBaseName(ts, tag, tok); - } - - // Parse the `tlang` production within a parsed 't' transform extension. 
- // The precise requirements for "previously parsed" are: - // - // * the input begins from current token |tok| with a valid `tlang` - // * the `tlang` is wholly lowercase (*not* canonical case) - // * variant subtags in the `tlang` may contain duplicates and be - // unordered - // - // Return an error on internal failure. Otherwise, return a success value. If - // there was no `tlang`, then |tag.language().missing()|. But if there was a - // `tlang`, then |tag| is filled with subtags exactly as they appeared in the - // parse input. - static Result parseTlangInTransformExtension( - LocaleParser& ts, Locale& tag, Token& tok) { - MOZ_ASSERT(ts.isLanguage(tok)); - return internalParseBaseName(ts, tag, tok); - } - - friend class Locale; - - class Range final { - size_t begin_; - size_t length_; - - public: - Range(size_t begin, size_t length) : begin_(begin), length_(length) {} - - template - T* begin(T* ptr) const { - return ptr + begin_; - } - - size_t length() const { return length_; } - }; - - using TFieldVector = Vector; - using AttributesVector = Vector; - using KeywordsVector = Vector; - - // Parse |extension|, which must be a validated, fully lowercase - // `transformed_extensions` subtag, and fill |tag| and |fields| from the - // `tlang` and `tfield` components. Data in |tag| is lowercase, consistent - // with |extension|. - static Result parseTransformExtension( - mozilla::Span extension, Locale& tag, TFieldVector& fields); - - // Parse |extension|, which must be a validated, fully lowercase - // `unicode_locale_extensions` subtag, and fill |attributes| and |keywords| - // from the `attribute` and `keyword` components. - static Result parseUnicodeExtension( - mozilla::Span extension, AttributesVector& attributes, - KeywordsVector& keywords); - - public: - // Parse the input string as a locale. - static Result tryParse(Span locale, Locale& tag); - - // Parse the input string as the base-name parts (language, script, region, - // variants) of a locale. 
- static Result tryParseBaseName(Span locale, - Locale& tag); - - // Return Ok() iff |extension| can be parsed as a Unicode extension subtag. - static Result canParseUnicodeExtension( - Span extension); - - // Return Ok() iff |unicodeType| can be parsed as a Unicode extension type. - static Result canParseUnicodeExtensionType( - Span unicodeType); -}; - -MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(LocaleParser::TokenKind) - } // namespace mozilla::intl -#endif /* intl_components_Locale_h */ +#endif diff --git a/intl/components/src/LocaleCanonicalizer.cpp b/intl/components/src/LocaleCanonicalizer.cpp index 8a838743902b..eda38aa95f28 100644 --- a/intl/components/src/LocaleCanonicalizer.cpp +++ b/intl/components/src/LocaleCanonicalizer.cpp @@ -11,7 +11,7 @@ namespace mozilla::intl { /* static */ ICUResult LocaleCanonicalizer::CanonicalizeICULevel1( const char* aLocaleIn, LocaleCanonicalizer::Vector& aLocaleOut) { - auto result = FillBufferWithICUCall( + auto result = FillVectorWithICUCall( aLocaleOut, [&aLocaleIn](char* target, int32_t length, UErrorCode* status) { return uloc_canonicalize(aLocaleIn, target, length, status); diff --git a/intl/components/src/RelativeTimeFormat.h b/intl/components/src/RelativeTimeFormat.h index cf300b7bbae4..a31a4fc2cf84 100644 --- a/intl/components/src/RelativeTimeFormat.h +++ b/intl/components/src/RelativeTimeFormat.h @@ -91,7 +91,7 @@ class RelativeTimeFormat final { if constexpr (std::is_same::value) { mozilla::Vector u16Vec; - MOZ_TRY(FillBufferWithICUCall( + MOZ_TRY(FillVectorWithICUCall( u16Vec, [this, aNumber, aUnit, fmt](UChar* target, int32_t length, UErrorCode* status) { return fmt(mFormatter, aNumber, ToURelativeDateTimeUnit(aUnit), diff --git a/intl/components/src/TimeZone.cpp b/intl/components/src/TimeZone.cpp index db5ebebd6636..b17d0ccbcd67 100644 --- a/intl/components/src/TimeZone.cpp +++ b/intl/components/src/TimeZone.cpp @@ -268,14 +268,14 @@ Result TimeZone::SetDefaultTimeZone( // Retrieve the current default time zone in 
case we need to restore it. TimeZoneIdentifierVector defaultTimeZone; - MOZ_TRY(FillBufferWithICUCall(defaultTimeZone, ucal_getDefaultTimeZone)); + MOZ_TRY(FillVectorWithICUCall(defaultTimeZone, ucal_getDefaultTimeZone)); // Try to set the new time zone. MOZ_TRY(mozilla::intl::SetDefaultTimeZone(tzid)); // Check if the time zone was actually applied. TimeZoneIdentifierVector newTimeZone; - MOZ_TRY(FillBufferWithICUCall(newTimeZone, ucal_getDefaultTimeZone)); + MOZ_TRY(FillVectorWithICUCall(newTimeZone, ucal_getDefaultTimeZone)); // Return if the new time zone was successfully applied. if (!IsUnknownTimeZone(newTimeZone)) { @@ -296,7 +296,7 @@ ICUResult TimeZone::SetDefaultTimeZoneFromHostTimeZone() { } #else TimeZoneIdentifierVector hostTimeZone; - MOZ_TRY(FillBufferWithICUCall(hostTimeZone, ucal_getHostTimeZone)); + MOZ_TRY(FillVectorWithICUCall(hostTimeZone, ucal_getHostTimeZone)); MOZ_TRY(mozilla::intl::SetDefaultTimeZone(hostTimeZone)); #endif diff --git a/intl/docs/icu.rst b/intl/docs/icu.rst index cc017ac21925..8cf8a6e85e43 100644 --- a/intl/docs/icu.rst +++ b/intl/docs/icu.rst @@ -230,7 +230,7 @@ Use ``make_intl_data.py``\ ’s ``langtags`` mode to update language tag informa $ export PYTHONPATH="$topsrcdir/third_party/python/PyYAML/lib3/" $ python3 ./make_intl_data.py langtags -The CLDR version used will be printed in the header of CLDR-sensitive generated files. For example, ``intl/components/src/LocaleGenerated.cpp`` currently begins with: +The CLDR version used will be printed in the header of CLDR-sensitive generated files. For example, ``js/src/builtin/intl/LanguageTagGenerated.cpp`` currently begins with: .. 
code:: cpp diff --git a/js/public/friend/ErrorNumbers.msg b/js/public/friend/ErrorNumbers.msg index 5b895a2fb4bf..8e27ea3bda15 100644 --- a/js/public/friend/ErrorNumbers.msg +++ b/js/public/friend/ErrorNumbers.msg @@ -583,7 +583,7 @@ MSG_DEF(JSMSG_TRACELOGGER_ENABLE_FAIL, 1, JSEXN_ERR, "enabling tracelogger faile // Intl MSG_DEF(JSMSG_DATE_NOT_FINITE, 2, JSEXN_RANGEERR, "date value is not finite in {0}.{1}()") -MSG_DEF(JSMSG_DUPLICATE_VARIANT_SUBTAG, 0, JSEXN_RANGEERR, "duplicate variant subtag") +MSG_DEF(JSMSG_DUPLICATE_VARIANT_SUBTAG, 1, JSEXN_RANGEERR, "duplicate variant subtag: {0}") MSG_DEF(JSMSG_INTERNAL_INTL_ERROR, 0, JSEXN_ERR, "internal error while computing Intl data") MSG_DEF(JSMSG_INVALID_CURRENCY_CODE, 1, JSEXN_RANGEERR, "invalid currency code in NumberFormat(): {0}") MSG_DEF(JSMSG_INVALID_UNIT_IDENTIFIER, 1, JSEXN_RANGEERR, "invalid unit identifier in NumberFormat(): {0}") diff --git a/js/src/builtin/intl/Collator.cpp b/js/src/builtin/intl/Collator.cpp index c8ef58367bec..be593f603363 100644 --- a/js/src/builtin/intl/Collator.cpp +++ b/js/src/builtin/intl/Collator.cpp @@ -10,12 +10,10 @@ #include "mozilla/Assertions.h" #include "mozilla/intl/Collator.h" -#include "mozilla/intl/Locale.h" #include "mozilla/Span.h" #include "builtin/Array.h" #include "builtin/intl/CommonFunctions.h" -#include "builtin/intl/FormatBuffer.h" #include "builtin/intl/LanguageTag.h" #include "builtin/intl/ScopedICUObject.h" #include "builtin/intl/SharedIntlData.h" @@ -243,12 +241,9 @@ static mozilla::intl::Collator* NewIntlCollator( } if (StringEqualsLiteral(usage, "search")) { // ICU expects search as a Unicode locale extension on locale. 
- mozilla::intl::Locale tag; - if (mozilla::intl::LocaleParser::tryParse( - mozilla::MakeStringSpan(locale.get()), tag) - .isErr()) { - JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, - JSMSG_INVALID_LANGUAGE_TAG, locale.get()); + intl::LanguageTag tag(cx); + if (!intl::LanguageTagParser::parse( + cx, mozilla::MakeStringSpan(locale.get()), tag)) { return nullptr; } @@ -266,13 +261,7 @@ static mozilla::intl::Collator* NewIntlCollator( return nullptr; } - intl::FormatBuffer buffer(cx); - if (auto result = tag.toString(buffer); result.isErr()) { - intl::ReportInternalError(cx, result.unwrapErr()); - return nullptr; - } - - locale = buffer.extractStringZ(); + locale = tag.toStringZ(cx); if (!locale) { return nullptr; } diff --git a/js/src/builtin/intl/DateTimeFormat.cpp b/js/src/builtin/intl/DateTimeFormat.cpp index 981cc64d07c9..6eb6d8940b04 100644 --- a/js/src/builtin/intl/DateTimeFormat.cpp +++ b/js/src/builtin/intl/DateTimeFormat.cpp @@ -13,7 +13,6 @@ #include "mozilla/intl/Calendar.h" #include "mozilla/intl/DateTimeFormat.h" #include "mozilla/intl/DateTimePatternGenerator.h" -#include "mozilla/intl/Locale.h" #include "mozilla/intl/TimeZone.h" #include "mozilla/Range.h" #include "mozilla/Span.h" @@ -521,14 +520,14 @@ static UniqueChars DateTimeFormatLocale( // ICU expects calendar, numberingSystem, and hourCycle as Unicode locale // extensions on locale. 
- mozilla::intl::Locale tag; + intl::LanguageTag tag(cx); { - RootedLinearString locale(cx, value.toString()->ensureLinear(cx)); + JSLinearString* locale = value.toString()->ensureLinear(cx); if (!locale) { return nullptr; } - if (!intl::ParseLocale(cx, locale, tag)) { + if (!intl::LanguageTagParser::parse(cx, locale, tag)) { return nullptr; } } @@ -596,12 +595,7 @@ static UniqueChars DateTimeFormatLocale( return nullptr; } - FormatBuffer buffer(cx); - if (auto result = tag.toString(buffer); result.isErr()) { - intl::ReportInternalError(cx, result.unwrapErr()); - return nullptr; - } - return buffer.extractStringZ(); + return tag.toStringZ(cx); } static bool AssignTextComponent( diff --git a/js/src/builtin/intl/DisplayNames.cpp b/js/src/builtin/intl/DisplayNames.cpp index d9c082557d97..844c79a770ab 100644 --- a/js/src/builtin/intl/DisplayNames.cpp +++ b/js/src/builtin/intl/DisplayNames.cpp @@ -10,7 +10,6 @@ #include "mozilla/Assertions.h" #include "mozilla/intl/DateTimePatternGenerator.h" -#include "mozilla/intl/Locale.h" #include "mozilla/Span.h" #include "mozilla/TextUtils.h" @@ -22,11 +21,9 @@ #include "jspubtd.h" #include "builtin/intl/CommonFunctions.h" -#include "builtin/intl/FormatBuffer.h" #include "builtin/intl/LanguageTag.h" #include "builtin/intl/ScopedICUObject.h" #include "builtin/intl/SharedIntlData.h" -#include "builtin/intl/StringAsciiChars.h" #include "builtin/String.h" #include "gc/AllocKind.h" #include "gc/FreeOp.h" @@ -73,8 +70,6 @@ using namespace js; using js::intl::CallICU; using js::intl::IcuLocale; -using mozilla::intl::LocaleParser; - const JSClassOps DisplayNamesObject::classOps_ = {nullptr, /* addProperty */ nullptr, /* delProperty */ nullptr, /* enumerate */ @@ -337,54 +332,27 @@ static void ReportInvalidOptionError(JSContext* cx, const char* type, } } -static bool TryParseBaseName(JSContext* cx, HandleLinearString languageStr, - mozilla::intl::Locale& tag) { - if (StringIsAscii(languageStr)) { - intl::StringAsciiChars 
chars(languageStr); - if (!chars.init(cx)) { - return false; - } - - if (LocaleParser::tryParseBaseName(chars, tag).isOk()) { - return true; - } - } - - ReportInvalidOptionError(cx, "language", languageStr); - return false; -} - static JSString* GetLanguageDisplayName( JSContext* cx, Handle displayNames, const char* locale, DisplayNamesStyle displayStyle, DisplayNamesLanguageDisplay languageDisplay, DisplayNamesFallback fallback, HandleLinearString languageStr) { - mozilla::intl::Locale tag; - if (!TryParseBaseName(cx, languageStr, tag)) { + bool ok; + intl::LanguageTag tag(cx); + JS_TRY_VAR_OR_RETURN_NULL( + cx, ok, intl::LanguageTagParser::tryParseBaseName(cx, languageStr, tag)); + if (!ok) { + ReportInvalidOptionError(cx, "language", languageStr); return nullptr; } // ICU always canonicalizes the input locale, but since we know that ICU's // canonicalization is incomplete, we need to perform our own canonicalization // to ensure consistent result. - if (auto result = tag.canonicalizeBaseName(); result.isErr()) { - if (result.unwrapErr() == - mozilla::intl::Locale::CanonicalizationError::DuplicateVariant) { - JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, - JSMSG_DUPLICATE_VARIANT_SUBTAG); - } else { - intl::ReportInternalError(cx); - } - + if (!tag.canonicalizeBaseName(cx)) { return nullptr; } - intl::FormatBuffer buffer(cx); - if (auto result = tag.toString(buffer); result.isErr()) { - intl::ReportInternalError(cx, result.unwrapErr()); - return nullptr; - } - - UniqueChars languageChars = buffer.extractStringZ(); + UniqueChars languageChars = tag.toStringZ(cx); if (!languageChars) { return nullptr; } @@ -426,24 +394,22 @@ static JSString* GetScriptDisplayName(JSContext* cx, DisplayNamesStyle displayStyle, DisplayNamesFallback fallback, HandleLinearString scriptStr) { - mozilla::intl::ScriptSubtag script; + intl::ScriptSubtag script; if (!intl::ParseStandaloneScriptTag(scriptStr, script)) { ReportInvalidOptionError(cx, "script", scriptStr); return 
nullptr; } - mozilla::intl::Locale tag; + intl::LanguageTag tag(cx); tag.setLanguage("und"); tag.setScript(script); // ICU always canonicalizes the input locale, but since we know that ICU's // canonicalization is incomplete, we need to perform our own canonicalization // to ensure consistent result. - if (tag.canonicalizeBaseName().isErr()) { - intl::ReportInternalError(cx); + if (!tag.canonicalizeBaseName(cx)) { return nullptr; } - MOZ_ASSERT(tag.script().present()); // |uldn_scriptDisplayName| doesn't use the stand-alone form for script @@ -453,13 +419,7 @@ static JSString* GetScriptDisplayName(JSContext* cx, // ICU bug: https://unicode-org.atlassian.net/browse/ICU-9301 if (displayStyle == DisplayNamesStyle::Long) { // |uloc_getDisplayScript| expects a full locale identifier as its input. - intl::FormatBuffer buffer(cx); - if (auto result = tag.toString(buffer); result.isErr()) { - intl::ReportInternalError(cx, result.unwrapErr()); - return nullptr; - } - - UniqueChars scriptChars = buffer.extractStringZ(); + UniqueChars scriptChars = tag.toStringZ(cx); if (!scriptChars) { return nullptr; } @@ -492,9 +452,9 @@ static JSString* GetScriptDisplayName(JSContext* cx, } // Note: ICU requires the script subtag to be in canonical case. 
- const mozilla::intl::ScriptSubtag& canonicalScript = tag.script(); + const intl::ScriptSubtag& canonicalScript = tag.script(); - char scriptChars[mozilla::intl::LanguageTagLimits::ScriptLength + 1] = {}; + char scriptChars[intl::LanguageTagLimits::ScriptLength + 1] = {}; std::copy_n(canonicalScript.span().data(), canonicalScript.length(), scriptChars); @@ -535,30 +495,28 @@ static JSString* GetRegionDisplayName(JSContext* cx, DisplayNamesStyle displayStyle, DisplayNamesFallback fallback, HandleLinearString regionStr) { - mozilla::intl::RegionSubtag region; + intl::RegionSubtag region; if (!intl::ParseStandaloneRegionTag(regionStr, region)) { ReportInvalidOptionError(cx, "region", regionStr); return nullptr; } - mozilla::intl::Locale tag; + intl::LanguageTag tag(cx); tag.setLanguage("und"); tag.setRegion(region); // ICU always canonicalizes the input locale, but since we know that ICU's // canonicalization is incomplete, we need to perform our own canonicalization // to ensure consistent result. - if (tag.canonicalizeBaseName().isErr()) { - intl::ReportInternalError(cx); + if (!tag.canonicalizeBaseName(cx)) { return nullptr; } - MOZ_ASSERT(tag.region().present()); // Note: ICU requires the region subtag to be in canonical case. - const mozilla::intl::RegionSubtag& canonicalRegion = tag.region(); + const intl::RegionSubtag& canonicalRegion = tag.region(); - char regionChars[mozilla::intl::LanguageTagLimits::RegionLength + 1] = {}; + char regionChars[intl::LanguageTagLimits::RegionLength + 1] = {}; std::copy_n(canonicalRegion.span().data(), canonicalRegion.length(), regionChars); @@ -655,26 +613,21 @@ static JSString* GetCalendarDisplayName( DisplayNamesStyle displayStyle, DisplayNamesFallback fallback, HandleLinearString calendarStr) { // Report an error if the input can't be parsed as a Unicode type nonterminal. 
- if (calendarStr->empty() || !StringIsAscii(calendarStr)) { + if (calendarStr->empty() || + !intl::LanguageTagParser::canParseUnicodeExtensionType(calendarStr)) { ReportInvalidOptionError(cx, "calendar", calendarStr); return nullptr; } + MOZ_ASSERT(StringIsAscii(calendarStr), "Unicode extension types are ASCII"); + UniqueChars calendar = EncodeAscii(cx, calendarStr); if (!calendar) { return nullptr; } - if (LocaleParser::canParseUnicodeExtensionType( - mozilla::Span(calendar.get(), calendarStr->length())) - .isErr()) { - ReportInvalidOptionError(cx, "calendar", calendarStr); - return nullptr; - } - // Convert into canonical case before searching for replacements. - mozilla::intl::AsciiToLowerCase(calendar.get(), calendarStr->length(), - calendar.get()); + intl::AsciiToLowerCase(calendar.get(), calendarStr->length(), calendar.get()); auto key = mozilla::MakeStringSpan("ca"); auto type = mozilla::Span(calendar.get(), calendarStr->length()); @@ -682,7 +635,7 @@ static JSString* GetCalendarDisplayName( // Search if there's a replacement for the Unicode calendar keyword. 
const char* canonicalCalendar = calendar.get(); if (const char* replacement = - mozilla::intl::Locale::replaceUnicodeExtensionType(key, type)) { + intl::LanguageTag::replaceUnicodeExtensionType(key, type)) { canonicalCalendar = replacement; } @@ -779,10 +732,9 @@ static ListObject* GetDateTimeDisplayNames( return names; } - mozilla::intl::Locale tag; - if (LocaleParser::tryParse(mozilla::MakeStringSpan(locale), tag).isErr()) { - JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, - JSMSG_INVALID_LANGUAGE_TAG, locale); + intl::LanguageTag tag(cx); + if (!intl::LanguageTagParser::parse(cx, mozilla::MakeStringSpan(locale), + tag)) { return nullptr; } @@ -795,12 +747,7 @@ static ListObject* GetDateTimeDisplayNames( return nullptr; } - intl::FormatBuffer buffer(cx); - if (auto result = tag.toString(buffer); result.isErr()) { - intl::ReportInternalError(cx, result.unwrapErr()); - return nullptr; - } - UniqueChars localeWithCalendar = buffer.extractStringZ(); + UniqueChars localeWithCalendar = tag.toStringZ(cx); if (!localeWithCalendar) { return nullptr; } diff --git a/js/src/builtin/intl/FormatBuffer.h b/js/src/builtin/intl/FormatBuffer.h index e5ac1b77f063..582eb076ed7d 100644 --- a/js/src/builtin/intl/FormatBuffer.h +++ b/js/src/builtin/intl/FormatBuffer.h @@ -8,6 +8,7 @@ #define builtin_intl_FormatBuffer_h #include "mozilla/Assertions.h" +#include "mozilla/Range.h" #include "mozilla/Span.h" #include @@ -15,7 +16,6 @@ #include "gc/Allocator.h" #include "js/AllocPolicy.h" -#include "js/CharacterEncoding.h" #include "js/TypeDecls.h" #include "js/UniquePtr.h" #include "js/Vector.h" @@ -89,7 +89,8 @@ class FormatBuffer { std::is_same_v) { // Handle the UTF-8 encoding case. return NewStringCopyUTF8N( - cx, JS::UTF8Chars(buffer_.begin(), buffer_.length())); + cx, mozilla::Range(reinterpret_cast(buffer_.begin()), + buffer_.length())); } else { // Handle the UTF-16 encoding case. 
static_assert(std::is_same_v); diff --git a/js/src/builtin/intl/IntlObject.cpp b/js/src/builtin/intl/IntlObject.cpp index 63c1f60894be..4b4d347ce168 100644 --- a/js/src/builtin/intl/IntlObject.cpp +++ b/js/src/builtin/intl/IntlObject.cpp @@ -32,7 +32,6 @@ #include "builtin/intl/PluralRules.h" #include "builtin/intl/RelativeTimeFormat.h" #include "builtin/intl/SharedIntlData.h" -#include "builtin/intl/StringAsciiChars.h" #include "ds/Sort.h" #include "js/CharacterEncoding.h" #include "js/Class.h" @@ -261,54 +260,29 @@ bool js::intl_BestAvailableLocale(JSContext* cx, unsigned argc, Value* vp) { #ifdef DEBUG { - MOZ_ASSERT(StringIsAscii(locale), "language tags are ASCII-only"); - - // |locale| is a structurally valid language tag. - mozilla::intl::Locale tag; - - using ParserError = mozilla::intl::LocaleParser::ParserError; - mozilla::Result parse_result = Ok(); - { - intl::StringAsciiChars chars(locale); - if (!chars.init(cx)) { - return false; - } - - parse_result = mozilla::intl::LocaleParser::tryParse(chars, tag); - } - - if (parse_result.isErr()) { - MOZ_ASSERT(parse_result.unwrapErr() == ParserError::OutOfMemory, - "locale is a structurally valid language tag"); - - intl::ReportInternalError(cx); - return false; - } + intl::LanguageTag tag(cx); + bool ok; + JS_TRY_VAR_OR_RETURN_FALSE( + cx, ok, intl::LanguageTagParser::tryParse(cx, locale, tag)); + MOZ_ASSERT(ok, "locale is a structurally valid language tag"); MOZ_ASSERT(!tag.unicodeExtension(), "locale must contain no Unicode extensions"); - if (auto result = tag.canonicalize(); result.isErr()) { - MOZ_ASSERT( - result.unwrapErr() != - mozilla::intl::Locale::CanonicalizationError::DuplicateVariant); - intl::ReportInternalError(cx); + if (!tag.canonicalize(cx)) { return false; } - intl::FormatBuffer buffer(cx); - if (auto result = tag.toString(buffer); result.isErr()) { - intl::ReportInternalError(cx, result.unwrapErr()); - return false; - } - - JSLinearString* tagStr = buffer.toString(cx); + JSString* tagStr = 
tag.toString(cx); if (!tagStr) { return false; } - MOZ_ASSERT(EqualStrings(locale, tagStr), - "locale is a canonicalized language tag"); + bool canonical; + if (!EqualStrings(cx, locale, tagStr, &canonical)) { + return false; + } + MOZ_ASSERT(canonical, "locale is a canonicalized language tag"); } #endif @@ -344,47 +318,39 @@ bool js::intl_supportedLocaleOrFallback(JSContext* cx, unsigned argc, return false; } - mozilla::intl::Locale tag; - bool canParseLocale = false; - if (StringIsAscii(locale)) { - intl::StringAsciiChars chars(locale); - if (!chars.init(cx)) { - return false; - } - - // Tell the analysis the |tag.canonicalize()| method can't GC. - JS::AutoSuppressGCAnalysis nogc; - - canParseLocale = mozilla::intl::LocaleParser::tryParse(chars, tag).isOk() && - tag.canonicalize().isOk(); - } + intl::LanguageTag tag(cx); + bool ok; + JS_TRY_VAR_OR_RETURN_FALSE( + cx, ok, intl::LanguageTagParser::tryParse(cx, locale, tag)); RootedLinearString candidate(cx); - if (!canParseLocale) { + if (!ok) { candidate = NewStringCopyZ(cx, intl::LastDitchLocale()); if (!candidate) { return false; } } else { + if (!tag.canonicalize(cx)) { + return false; + } + // The default locale must be in [[AvailableLocales]], and that list must // not contain any locales with Unicode extension sequences, so remove any // present in the candidate. tag.clearUnicodeExtension(); - intl::FormatBuffer buffer(cx); - if (auto result = tag.toString(buffer); result.isErr()) { - intl::ReportInternalError(cx, result.unwrapErr()); + JSString* canonical = tag.toString(cx); + if (!canonical) { return false; } - candidate = buffer.toString(cx); + candidate = canonical->ensureLinear(cx); if (!candidate) { return false; } - // Certain old-style language tags lack a script code, but in current - // usage they *would* include a script code. Map these over to modern - // forms. + // Certain old-style language tags lack a script code, but in current usage + // they *would* include a script code. 
Map these over to modern forms. for (const auto& mapping : js::intl::oldStyleLanguageTagMappings) { const char* oldStyle = mapping.oldStyle; const char* modernStyle = mapping.modernStyle; @@ -404,8 +370,8 @@ bool js::intl_supportedLocaleOrFallback(JSContext* cx, unsigned argc, // - [[AvailableLocales]] is a List [...]. The list must include the value // returned by the DefaultLocale abstract operation (6.2.4), [...]. // - // That implies we must ignore any candidate which isn't supported by all - // Intl service constructors. + // That implies we must ignore any candidate which isn't supported by all Intl + // service constructors. RootedLinearString supportedCollator(cx); JS_TRY_VAR_OR_RETURN_FALSE( @@ -421,8 +387,8 @@ bool js::intl_supportedLocaleOrFallback(JSContext* cx, unsigned argc, #ifdef DEBUG // Note: We don't test the supported locales of the remaining Intl service - // constructors, because the set of supported locales is exactly equal to - // the set of supported locales of Intl.DateTimeFormat. + // constructors, because the set of supported locales is exactly equal to the + // set of supported locales of Intl.DateTimeFormat. for (auto kind : {SupportedLocaleKind::DisplayNames, SupportedLocaleKind::ListFormat, SupportedLocaleKind::NumberFormat, SupportedLocaleKind::PluralRules, @@ -592,10 +558,9 @@ static ArrayObject* AvailableCalendars(JSContext* cx) { Rooted list(cx, StringList(cx)); { - // Hazard analysis complains that the mozilla::Result destructor calls a - // GC function, which is unsound when returning an unrooted value. Work - // around this issue by restricting the lifetime of |keywords| to a - // separate block. + // Hazard analysis complains that the mozilla::Result destructor calls a GC + // function, which is unsound when returning an unrooted value. Work around + // this issue by restricting the lifetime of |keywords| to a separate block. 
auto keywords = mozilla::intl::Calendar::GetBcp47KeywordValuesForLocale(""); if (keywords.isErr()) { intl::ReportInternalError(cx, keywords.unwrapErr()); @@ -634,10 +599,9 @@ static ArrayObject* AvailableCollations(JSContext* cx) { Rooted list(cx, StringList(cx)); { - // Hazard analysis complains that the mozilla::Result destructor calls a - // GC function, which is unsound when returning an unrooted value. Work - // around this issue by restricting the lifetime of |keywords| to a - // separate block. + // Hazard analysis complains that the mozilla::Result destructor calls a GC + // function, which is unsound when returning an unrooted value. Work around + // this issue by restricting the lifetime of |keywords| to a separate block. auto keywords = mozilla::intl::Collator::GetBcp47KeywordValues(); if (keywords.isErr()) { intl::ReportInternalError(cx, keywords.unwrapErr()); @@ -653,15 +617,13 @@ static ArrayObject* AvailableCollations(JSContext* cx) { // |ucol_getKeywordValues| returns the possible collations for all installed // locales. The root locale is excluded in the list of installed locales, so - // we have to explicitly request the available collations of the root - // locale. + // we have to explicitly request the available collations of the root locale. // // https://unicode-org.atlassian.net/browse/ICU-21641 { - // Hazard analysis complains that the mozilla::Result destructor calls a - // GC function, which is unsound when returning an unrooted value. Work - // around this issue by restricting the lifetime of |keywords| to a - // separate block. + // Hazard analysis complains that the mozilla::Result destructor calls a GC + // function, which is unsound when returning an unrooted value. Work around + // this issue by restricting the lifetime of |keywords| to a separate block. 
auto keywords = mozilla::intl::Collator::GetBcp47KeywordValuesForLocale(""); if (keywords.isErr()) { intl::ReportInternalError(cx, keywords.unwrapErr()); @@ -715,10 +677,9 @@ static ArrayObject* AvailableCurrencies(JSContext* cx) { Rooted list(cx, StringList(cx)); { - // Hazard analysis complains that the mozilla::Result destructor calls a - // GC function, which is unsound when returning an unrooted value. Work - // around this issue by restricting the lifetime of |keywords| to a - // separate block. + // Hazard analysis complains that the mozilla::Result destructor calls a GC + // function, which is unsound when returning an unrooted value. Work around + // this issue by restricting the lifetime of |keywords| to a separate block. auto currencies = mozilla::intl::Currency::GetISOCurrencies(); if (currencies.isErr()) { intl::ReportInternalError(cx, currencies.unwrapErr()); @@ -762,8 +723,7 @@ static ArrayObject* AvailableNumberingSystems(JSContext* cx) { * AvailableTimeZones ( ) */ static ArrayObject* AvailableTimeZones(JSContext* cx) { - // Unsorted list of canonical time zone names, possibly containing - // duplicates. + // Unsorted list of canonical time zone names, possibly containing duplicates. Rooted timeZones(cx, StringList(cx)); intl::SharedIntlData& sharedIntlData = cx->runtime()->sharedIntlData.ref(); @@ -780,8 +740,8 @@ static ArrayObject* AvailableTimeZones(JSContext* cx) { // Canonicalize the time zone before adding it to the result array. - // Some time zone names are canonicalized differently by ICU -- handle - // those first. + // Some time zone names are canonicalized differently by ICU -- handle those + // first. 
ianaTimeZone.set(nullptr); if (!sharedIntlData.tryCanonicalizeTimeZoneConsistentWithIANA( cx, validatedTimeZone, &ianaTimeZone)) { diff --git a/js/src/builtin/intl/LanguageTag.cpp b/js/src/builtin/intl/LanguageTag.cpp index d8a29083af43..c0e833cc659d 100644 --- a/js/src/builtin/intl/LanguageTag.cpp +++ b/js/src/builtin/intl/LanguageTag.cpp @@ -6,52 +6,1587 @@ #include "builtin/intl/LanguageTag.h" -#include "mozilla/intl/Locale.h" +#include "mozilla/Assertions.h" +#include "mozilla/DebugOnly.h" +#include "mozilla/MathAlgorithms.h" #include "mozilla/Span.h" +#include "mozilla/TextUtils.h" +#include "mozilla/Variant.h" -#include "builtin/intl/StringAsciiChars.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include "builtin/intl/CommonFunctions.h" +#include "ds/Sort.h" #include "gc/Tracer.h" -#include "js/CharacterEncoding.h" +#include "js/friend/ErrorMessages.h" // js::GetErrorMessage, JSMSG_* +#include "js/Result.h" #include "js/TracingAPI.h" +#include "js/Utility.h" +#include "js/Vector.h" +#include "unicode/uloc.h" +#include "unicode/utypes.h" +#include "util/StringBuffer.h" +#include "util/Text.h" #include "vm/JSContext.h" +#include "vm/Printer.h" +#include "vm/StringType.h" namespace js { namespace intl { -[[nodiscard]] bool ParseLocale(JSContext* cx, HandleLinearString str, - mozilla::intl::Locale& result) { - if (StringIsAscii(str)) { - intl::StringAsciiChars chars(str); - if (!chars.init(cx)) { +using namespace js::intl::LanguageTagLimits; + +template +bool IsStructurallyValidLanguageTag(mozilla::Span language) { + // Tell the analysis the |std::all_of| function can't GC. 
+ JS::AutoSuppressGCAnalysis nogc; + + // unicode_language_subtag = alpha{2,3} | alpha{5,8}; + size_t length = language.size(); + const CharT* str = language.data(); + return ((2 <= length && length <= 3) || (5 <= length && length <= 8)) && + std::all_of(str, str + length, mozilla::IsAsciiAlpha); +} + +template bool IsStructurallyValidLanguageTag( + mozilla::Span language); +template bool IsStructurallyValidLanguageTag( + mozilla::Span language); +template bool IsStructurallyValidLanguageTag( + mozilla::Span language); + +template +bool IsStructurallyValidScriptTag(mozilla::Span script) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + // unicode_script_subtag = alpha{4} ; + size_t length = script.size(); + const CharT* str = script.data(); + return length == 4 && + std::all_of(str, str + length, mozilla::IsAsciiAlpha); +} + +template bool IsStructurallyValidScriptTag(mozilla::Span script); +template bool IsStructurallyValidScriptTag( + mozilla::Span script); +template bool IsStructurallyValidScriptTag( + mozilla::Span script); + +template +bool IsStructurallyValidRegionTag(mozilla::Span region) { + // Tell the analysis the |std::all_of| function can't GC. 
+ JS::AutoSuppressGCAnalysis nogc; + + // unicode_region_subtag = (alpha{2} | digit{3}) ; + size_t length = region.size(); + const CharT* str = region.data(); + return (length == 2 && + std::all_of(str, str + length, mozilla::IsAsciiAlpha)) || + (length == 3 && + std::all_of(str, str + length, mozilla::IsAsciiDigit)); +} + +template bool IsStructurallyValidRegionTag(mozilla::Span region); +template bool IsStructurallyValidRegionTag( + mozilla::Span region); +template bool IsStructurallyValidRegionTag( + mozilla::Span region); + +#ifdef DEBUG +bool IsStructurallyValidVariantTag(mozilla::Span variant) { + // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ; + size_t length = variant.size(); + const char* str = variant.data(); + return ((5 <= length && length <= 8) || + (length == 4 && mozilla::IsAsciiDigit(str[0]))) && + std::all_of(str, str + length, mozilla::IsAsciiAlphanumeric); +} + +bool IsStructurallyValidUnicodeExtensionTag( + mozilla::Span extension) { + return LanguageTagParser::canParseUnicodeExtension(extension); +} + +static bool IsStructurallyValidExtensionTag( + mozilla::Span extension) { + // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; + // NB: Allow any extension, including Unicode and Transform here, because + // this function is only used for an assertion. + + size_t length = extension.size(); + const char* str = extension.data(); + const char* const end = extension.data() + length; + if (length <= 2) { + return false; + } + if (!mozilla::IsAsciiAlphanumeric(str[0]) || str[0] == 'x' || str[0] == 'X') { + return false; + } + str++; + if (*str++ != '-') { + return false; + } + while (true) { + const char* sep = + reinterpret_cast(memchr(str, '-', end - str)); + size_t len = (sep ? 
sep : end) - str; + if (len < 2 || len > 8 || + !std::all_of(str, str + len, mozilla::IsAsciiAlphanumeric)) { + return false; + } + if (!sep) { + return true; + } + str = sep + 1; + } +} + +bool IsStructurallyValidPrivateUseTag(mozilla::Span privateUse) { + // pu_extensions = sep [xX] (sep alphanum{1,8})+ ; + + size_t length = privateUse.size(); + const char* str = privateUse.data(); + const char* const end = privateUse.data() + length; + if (length <= 2) { + return false; + } + if (str[0] != 'x' && str[0] != 'X') { + return false; + } + str++; + if (*str++ != '-') { + return false; + } + while (true) { + const char* sep = + reinterpret_cast(memchr(str, '-', end - str)); + size_t len = (sep ? sep : end) - str; + if (len == 0 || len > 8 || + !std::all_of(str, str + len, mozilla::IsAsciiAlphanumeric)) { + return false; + } + if (!sep) { + return true; + } + str = sep + 1; + } +} +#endif + +ptrdiff_t LanguageTag::unicodeExtensionIndex() const { + // The extension subtags aren't necessarily sorted, so we can't use binary + // search here. + auto p = std::find_if( + extensions().begin(), extensions().end(), + [](const auto& ext) { return ext[0] == 'u' || ext[0] == 'U'; }); + if (p != extensions().end()) { + return std::distance(extensions().begin(), p); + } + return -1; +} + +const char* LanguageTag::unicodeExtension() const { + ptrdiff_t index = unicodeExtensionIndex(); + if (index >= 0) { + return extensions()[index].get(); + } + return nullptr; +} + +bool LanguageTag::setUnicodeExtension(UniqueChars extension) { + MOZ_ASSERT(IsStructurallyValidUnicodeExtensionTag( + mozilla::MakeStringSpan(extension.get()))); + + // Replace the existing Unicode extension subtag or append a new one. 
+ ptrdiff_t index = unicodeExtensionIndex(); + if (index >= 0) { + extensions_[index] = std::move(extension); + return true; + } + return extensions_.append(std::move(extension)); +} + +void LanguageTag::clearUnicodeExtension() { + ptrdiff_t index = unicodeExtensionIndex(); + if (index >= 0) { + extensions_.erase(extensions_.begin() + index); + } +} + +template +static bool SortAlphabetically(JSContext* cx, + Vector& subtags) { + size_t length = subtags.length(); + + // Zero or one element lists are already sorted. + if (length < 2) { + return true; + } + + // Handle two element lists inline. + if (length == 2) { + if (strcmp(subtags[0].get(), subtags[1].get()) > 0) { + subtags[0].swap(subtags[1]); + } + return true; + } + + Vector scratch(cx); + if (!scratch.resizeUninitialized(length * 2)) { + return false; + } + for (size_t i = 0; i < length; i++) { + scratch[i] = subtags[i].release(); + } + + MOZ_ALWAYS_TRUE( + MergeSort(scratch.begin(), length, scratch.begin() + length, + [](const char* a, const char* b, bool* lessOrEqualp) { + *lessOrEqualp = strcmp(a, b) <= 0; + return true; + })); + + for (size_t i = 0; i < length; i++) { + subtags[i] = UniqueChars(scratch[i]); + } + return true; +} + +bool LanguageTag::canonicalizeBaseName(JSContext* cx) { + // Per 6.2.3 CanonicalizeUnicodeLocaleId, the very first step is to + // canonicalize the syntax by normalizing the case and ordering all subtags. + // The canonical syntax form is specified in UTS 35, 3.2.1. + + // Language codes need to be in lower case. "JA" -> "ja" + language_.toLowerCase(); + MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span())); + + // The first character of a script code needs to be capitalized. + // "hans" -> "Hans" + script_.toTitleCase(); + MOZ_ASSERT(script().missing() || + IsStructurallyValidScriptTag(script().span())); + + // Region codes need to be in upper case. 
"bu" -> "BU" + region_.toUpperCase(); + MOZ_ASSERT(region().missing() || + IsStructurallyValidRegionTag(region().span())); + + // The canonical case for variant subtags is lowercase. + for (UniqueChars& variant : variants_) { + char* variantChars = variant.get(); + size_t variantLength = strlen(variantChars); + AsciiToLowerCase(variantChars, variantLength, variantChars); + + MOZ_ASSERT(IsStructurallyValidVariantTag({variantChars, variantLength})); + } + + // Extensions and privateuse subtags are case normalized in the + // |canonicalizeExtensions| method. + + // The second step in UTS 35, 3.2.1, is to order all subtags. + + if (variants_.length() > 1) { + // 1. Any variants are in alphabetical order. + if (!SortAlphabetically(cx, variants_)) { return false; } - if (mozilla::intl::LocaleParser::tryParse(chars, result).isOk()) { - return true; + // Reject the Locale identifier if a duplicate variant was found, e.g. + // "en-variant-Variant". + const UniqueChars* duplicate = std::adjacent_find( + variants().begin(), variants().end(), [](const auto& a, const auto& b) { + return strcmp(a.get(), b.get()) == 0; + }); + if (duplicate != variants().end()) { + JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, + JSMSG_DUPLICATE_VARIANT_SUBTAG, + duplicate->get()); + return false; } } - if (UniqueChars localeChars = QuoteString(cx, str, '"')) { + // 2. Any extensions are in alphabetical order by their singleton. + // 3. All attributes are sorted in alphabetical order. + // 4. All keywords and tfields are sorted by alphabetical order of their keys, + // within their respective extensions. + // 5. Any type or tfield value "true" is removed. + // - A subsequent call to canonicalizeExtensions() will perform these steps. + + // 6.2.3 CanonicalizeUnicodeLocaleId, step 2 transforms the locale identifier + // into its canonical form per UTS 3.2.1. + + // 1. Use the bcp47 data to replace keys, types, tfields, and tvalues by their + // canonical forms. 
+ // - A subsequent call to canonicalizeExtensions() will perform this step. + + // 2. Replace aliases in the unicode_language_id and tlang (if any). + // - tlang is handled in canonicalizeExtensions(). + + // Replace deprecated language, region, and variant subtags with their + // preferred mappings. + + if (!updateLegacyMappings(cx)) { + return false; + } + + // Replace deprecated language subtags with their preferred values. + if (!languageMapping(language_) && complexLanguageMapping(language_)) { + performComplexLanguageMappings(); + } + + // Replace deprecated script subtags with their preferred values. + if (script().present()) { + scriptMapping(script_); + } + + // Replace deprecated region subtags with their preferred values. + if (region().present()) { + if (!regionMapping(region_) && complexRegionMapping(region_)) { + performComplexRegionMappings(); + } + } + + // Replace deprecated variant subtags with their preferred values. + if (!performVariantMappings(cx)) { + return false; + } + + // No extension replacements are currently present. + // Private use sequences are left as is. + + // 3. Replace aliases in special key values. + // - A subsequent call to canonicalizeExtensions() will perform this step. + + return true; +} + +#ifdef DEBUG +template +static bool IsAsciiLowercaseAlphanumericOrDash( + mozilla::Span span) { + const CharT* ptr = span.data(); + size_t length = span.size(); + return std::all_of(ptr, ptr + length, [](auto c) { + return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c) || + c == '-'; + }); +} +#endif + +bool LanguageTag::canonicalizeExtensions(JSContext* cx) { + // The canonical case for all extension subtags is lowercase. 
+ for (UniqueChars& extension : extensions_) { + char* extensionChars = extension.get(); + size_t extensionLength = strlen(extensionChars); + AsciiToLowerCase(extensionChars, extensionLength, extensionChars); + + MOZ_ASSERT( + IsStructurallyValidExtensionTag({extensionChars, extensionLength})); + } + + // Any extensions are in alphabetical order by their singleton. + // "u-ca-chinese-t-zh-latn" -> "t-zh-latn-u-ca-chinese" + if (!SortAlphabetically(cx, extensions_)) { + return false; + } + + for (UniqueChars& extension : extensions_) { + if (extension[0] == 'u') { + if (!canonicalizeUnicodeExtension(cx, extension)) { + return false; + } + } else if (extension[0] == 't') { + if (!canonicalizeTransformExtension(cx, extension)) { + return false; + } + } + + MOZ_ASSERT(IsAsciiLowercaseAlphanumericOrDash( + mozilla::MakeStringSpan(extension.get()))); + } + + // The canonical case for privateuse subtags is lowercase. + if (char* privateuse = privateuse_.get()) { + size_t privateuseLength = strlen(privateuse); + AsciiToLowerCase(privateuse, privateuseLength, privateuse); + + MOZ_ASSERT( + IsStructurallyValidPrivateUseTag({privateuse, privateuseLength})); + } + return true; +} + +/** + * CanonicalizeUnicodeExtension( attributes, keywords ) + * + * Canonical syntax per + * : + * + * - All attributes and keywords are in lowercase. + * - Note: The parser already converted keywords to lowercase. + * - All attributes are sorted in alphabetical order. + * - All keywords are sorted by alphabetical order of their keys. + * - Any type value "true" is removed. + * + * Canonical form: + * - All keys and types use the canonical form (from the name attribute; + * see Section 3.6.4 U Extension Data Files). 
+ */ +bool LanguageTag::canonicalizeUnicodeExtension( + JSContext* cx, JS::UniqueChars& unicodeExtension) { + const char* const extension = unicodeExtension.get(); + MOZ_ASSERT(extension[0] == 'u'); + MOZ_ASSERT(extension[1] == '-'); + MOZ_ASSERT( + IsStructurallyValidExtensionTag(mozilla::MakeStringSpan(extension))); + + size_t length = strlen(extension); + + LanguageTagParser::AttributesVector attributes(cx); + LanguageTagParser::KeywordsVector keywords(cx); + + using Attribute = LanguageTagParser::AttributesVector::ElementType; + using Keyword = LanguageTagParser::KeywordsVector::ElementType; + + mozilla::DebugOnly ok; + JS_TRY_VAR_OR_RETURN_FALSE( + cx, ok, + LanguageTagParser::parseUnicodeExtension( + cx, mozilla::Span(extension, length), attributes, keywords)); + MOZ_ASSERT(ok, "unexpected invalid Unicode extension subtag"); + + auto attributesLessOrEqual = [extension](const Attribute& a, + const Attribute& b) { + const char* astr = a.begin(extension); + const char* bstr = b.begin(extension); + size_t alen = a.length(); + size_t blen = b.length(); + + if (int r = + std::char_traits::compare(astr, bstr, std::min(alen, blen))) { + return r < 0; + } + return alen <= blen; + }; + + // All attributes are sorted in alphabetical order. 
+ size_t attributesLength = attributes.length(); + if (attributesLength > 1) { + if (!attributes.growByUninitialized(attributesLength)) { + return false; + } + + MOZ_ALWAYS_TRUE( + MergeSort(attributes.begin(), attributesLength, + attributes.begin() + attributesLength, + [&](const auto& a, const auto& b, bool* lessOrEqualp) { + *lessOrEqualp = attributesLessOrEqual(a, b); + return true; + })); + + attributes.shrinkBy(attributesLength); + } + + auto keywordsLessOrEqual = [extension](const Keyword& a, const Keyword& b) { + const char* astr = a.begin(extension); + const char* bstr = b.begin(extension); + MOZ_ASSERT(a.length() >= UnicodeKeyLength); + MOZ_ASSERT(b.length() >= UnicodeKeyLength); + + return std::char_traits::compare(astr, bstr, UnicodeKeyLength) <= 0; + }; + + // All keywords are sorted by alphabetical order of keys. + size_t keywordsLength = keywords.length(); + if (keywordsLength > 1) { + if (!keywords.growByUninitialized(keywordsLength)) { + return false; + } + + // Using merge sort, being a stable sort algorithm, guarantees that two + // keywords using the same key are never reordered. That means for example + // when we have the input "u-nu-thai-kf-false-nu-latn", we are guaranteed to + // get the result "u-kf-false-nu-thai-nu-latn", i.e. "nu-thai" still occurs + // before "nu-latn". + // This is required so that deduplication below preserves the first keyword + // for a given key and discards the rest. + MOZ_ALWAYS_TRUE(MergeSort( + keywords.begin(), keywordsLength, keywords.begin() + keywordsLength, + [&](const auto& a, const auto& b, bool* lessOrEqualp) { + *lessOrEqualp = keywordsLessOrEqual(a, b); + return true; + })); + + keywords.shrinkBy(keywordsLength); + } + + Vector sb(cx); + if (!sb.append('u')) { + return false; + } + + // Append all Unicode extension attributes. + for (size_t i = 0; i < attributes.length(); i++) { + const auto& attribute = attributes[i]; + + // Skip duplicate attributes. 
+ if (i > 0) { + const auto& lastAttribute = attributes[i - 1]; + if (attribute.length() == lastAttribute.length() && + std::char_traits::compare(attribute.begin(extension), + lastAttribute.begin(extension), + attribute.length()) == 0) { + continue; + } + MOZ_ASSERT(!attributesLessOrEqual(attribute, lastAttribute)); + } + + if (!sb.append('-')) { + return false; + } + if (!sb.append(attribute.begin(extension), attribute.length())) { + return false; + } + } + + static constexpr size_t UnicodeKeyWithSepLength = UnicodeKeyLength + 1; + + using StringSpan = mozilla::Span; + + static auto isTrue = [](StringSpan type) { + static constexpr char True[] = "true"; + constexpr size_t TrueLength = std::char_traits::length(True); + return type.size() == TrueLength && + std::char_traits::compare(type.data(), True, TrueLength) == 0; + }; + + auto appendKey = [&sb, extension](const Keyword& keyword) { + MOZ_ASSERT(keyword.length() == UnicodeKeyLength); + return sb.append(keyword.begin(extension), UnicodeKeyLength); + }; + + auto appendKeyword = [&sb, extension](const Keyword& keyword, + StringSpan type) { + MOZ_ASSERT(keyword.length() > UnicodeKeyLength); + + // Elide the Unicode extension type "true". + if (isTrue(type)) { + return sb.append(keyword.begin(extension), UnicodeKeyLength); + } + // Otherwise append the complete Unicode extension keyword. + return sb.append(keyword.begin(extension), keyword.length()); + }; + + auto appendReplacement = [&sb, extension](const Keyword& keyword, + StringSpan replacement) { + MOZ_ASSERT(keyword.length() > UnicodeKeyLength); + + // Elide the type "true" if present in the replacement. + if (isTrue(replacement)) { + return sb.append(keyword.begin(extension), UnicodeKeyLength); + } + // Otherwise append the Unicode key (including the separator) and the + // replaced type. 
+ return sb.append(keyword.begin(extension), UnicodeKeyWithSepLength) && + sb.append(replacement.data(), replacement.size()); + }; + + // Append all Unicode extension keywords. + for (size_t i = 0; i < keywords.length(); i++) { + const auto& keyword = keywords[i]; + + // Skip duplicate keywords. + if (i > 0) { + const auto& lastKeyword = keywords[i - 1]; + if (std::char_traits::compare(keyword.begin(extension), + lastKeyword.begin(extension), + UnicodeKeyLength) == 0) { + continue; + } + MOZ_ASSERT(!keywordsLessOrEqual(keyword, lastKeyword)); + } + + if (!sb.append('-')) { + return false; + } + + if (keyword.length() == UnicodeKeyLength) { + // Keyword without type value. + if (!appendKey(keyword)) { + return false; + } + } else { + StringSpan key(keyword.begin(extension), UnicodeKeyLength); + StringSpan type(keyword.begin(extension) + UnicodeKeyWithSepLength, + keyword.length() - UnicodeKeyWithSepLength); + + // Search if there's a replacement for the current Unicode keyword. + if (const char* replacement = replaceUnicodeExtensionType(key, type)) { + if (!appendReplacement(keyword, mozilla::MakeStringSpan(replacement))) { + return false; + } + } else { + if (!appendKeyword(keyword, type)) { + return false; + } + } + } + } + + // We can keep the previous extension when canonicalization didn't modify it. + if (sb.length() != length || + std::char_traits::compare(sb.begin(), extension, length) != 0) { + // Null-terminate the new string and replace the previous extension. 
+ if (!sb.append('\0')) { + return false; + } + UniqueChars canonical(sb.extractOrCopyRawBuffer()); + if (!canonical) { + return false; + } + unicodeExtension = std::move(canonical); + } + + return true; +} + +template +static bool LanguageTagToString(JSContext* cx, const LanguageTag& tag, + Buffer& sb) { + auto appendSubtag = [&sb](const auto& subtag) { + auto span = subtag.span(); + MOZ_ASSERT(!span.empty()); + return sb.append(span.data(), span.size()); + }; + + auto appendSubtagZ = [&sb](const char* subtag) { + MOZ_ASSERT(strlen(subtag) > 0); + return sb.append(subtag, strlen(subtag)); + }; + + auto appendSubtagsZ = [&sb, &appendSubtagZ](const auto& subtags) { + for (const auto& subtag : subtags) { + if (!sb.append('-') || !appendSubtagZ(subtag.get())) { + return false; + } + } + return true; + }; + + // Append the language subtag. + if (!appendSubtag(tag.language())) { + return false; + } + + // Append the script subtag if present. + if (tag.script().present()) { + if (!sb.append('-') || !appendSubtag(tag.script())) { + return false; + } + } + + // Append the region subtag if present. + if (tag.region().present()) { + if (!sb.append('-') || !appendSubtag(tag.region())) { + return false; + } + } + + // Append the variant subtags if present. + if (!appendSubtagsZ(tag.variants())) { + return false; + } + + // Append the extensions subtags if present. + if (!appendSubtagsZ(tag.extensions())) { + return false; + } + + // Append the private-use subtag if present. + if (tag.privateuse()) { + if (!sb.append('-') || !appendSubtagZ(tag.privateuse())) { + return false; + } + } + + return true; +} + +/** + * CanonicalizeTransformExtension + * + * Canonical form per : + * + * - These subtags are all in lowercase (that is the canonical casing for these + * subtags), [...]. + * + * And per + * : + * + * - All keywords and tfields are sorted by alphabetical order of their keys, + * within their respective extensions. 
+ */ +bool LanguageTag::canonicalizeTransformExtension( + JSContext* cx, JS::UniqueChars& transformExtension) { + const char* const extension = transformExtension.get(); + MOZ_ASSERT(extension[0] == 't'); + MOZ_ASSERT(extension[1] == '-'); + MOZ_ASSERT( + IsStructurallyValidExtensionTag(mozilla::MakeStringSpan(extension))); + + size_t length = strlen(extension); + + LanguageTag tag(cx); + LanguageTagParser::TFieldVector fields(cx); + + using TField = LanguageTagParser::TFieldVector::ElementType; + + mozilla::DebugOnly ok; + JS_TRY_VAR_OR_RETURN_FALSE( + cx, ok, + LanguageTagParser::parseTransformExtension( + cx, mozilla::Span(extension, length), tag, fields)); + MOZ_ASSERT(ok, "unexpected invalid transform extension subtag"); + + auto tfieldLessOrEqual = [extension](const TField& a, const TField& b) { + MOZ_ASSERT(a.length() > TransformKeyLength); + MOZ_ASSERT(b.length() > TransformKeyLength); + const char* astr = a.begin(extension); + const char* bstr = b.begin(extension); + return std::char_traits::compare(astr, bstr, TransformKeyLength) <= 0; + }; + + // All tfields are sorted by alphabetical order of their keys. + if (size_t fieldsLength = fields.length(); fieldsLength > 1) { + if (!fields.growByUninitialized(fieldsLength)) { + return false; + } + + MOZ_ALWAYS_TRUE( + MergeSort(fields.begin(), fieldsLength, fields.begin() + fieldsLength, + [&](const auto& a, const auto& b, bool* lessOrEqualp) { + *lessOrEqualp = tfieldLessOrEqual(a, b); + return true; + })); + + fields.shrinkBy(fieldsLength); + } + + Vector sb(cx); + if (!sb.append('t')) { + return false; + } + + // Append the language subtag if present. + // + // Replace aliases in tlang per + // . + if (tag.language().present()) { + if (!sb.append('-')) { + return false; + } + + if (!tag.canonicalizeBaseName(cx)) { + return false; + } + + // The canonical case for Transform extensions is lowercase per + // . Convert the two + // subtags which don't use lowercase for their canonical syntax. 
+ tag.script_.toLowerCase(); + tag.region_.toLowerCase(); + + if (!LanguageTagToString(cx, tag, sb)) { + return false; + } + } + + static constexpr size_t TransformKeyWithSepLength = TransformKeyLength + 1; + + using StringSpan = mozilla::Span; + + // Append all fields. + // + // UTS 35, 3.2.1 specifies: + // - Any type or tfield value "true" is removed. + // + // But the `tvalue` subtag is mandatory in `tfield: tkey tvalue`, so ignore + // this apparently invalid part of the UTS 35 specification and simply + // append all `tfield` subtags. + for (const auto& field : fields) { + if (!sb.append('-')) { + return false; + } + + StringSpan key(field.begin(extension), TransformKeyLength); + StringSpan value(field.begin(extension) + TransformKeyWithSepLength, + field.length() - TransformKeyWithSepLength); + + // Search if there's a replacement for the current transform keyword. + if (const char* replacement = replaceTransformExtensionType(key, value)) { + if (!sb.append(field.begin(extension), TransformKeyWithSepLength)) { + return false; + } + if (!sb.append(replacement, strlen(replacement))) { + return false; + } + } else { + if (!sb.append(field.begin(extension), field.length())) { + return false; + } + } + } + + // We can keep the previous extension when canonicalization didn't modify it. + if (sb.length() != length || + std::char_traits::compare(sb.begin(), extension, length) != 0) { + // Null-terminate the new string and replace the previous extension. 
+ if (!sb.append('\0')) { + return false; + } + UniqueChars canonical(sb.extractOrCopyRawBuffer()); + if (!canonical) { + return false; + } + transformExtension = std::move(canonical); + } + + return true; +} + +JSString* LanguageTag::toString(JSContext* cx) const { + JSStringBuilder sb(cx); + if (!LanguageTagToString(cx, *this, sb)) { + return nullptr; + } + + return sb.finishString(); +} + +UniqueChars LanguageTag::toStringZ(JSContext* cx) const { + Vector sb(cx); + if (!LanguageTagToString(cx, *this, sb)) { + return nullptr; + } + if (!sb.append('\0')) { + return nullptr; + } + + return UniqueChars(sb.extractOrCopyRawBuffer()); +} + +// Zero-terminated ICU Locale ID. +using LocaleId = + js::Vector; + +enum class LikelySubtags : bool { Add, Remove }; + +// Return true iff the language tag is already maximized resp. minimized. +static bool HasLikelySubtags(LikelySubtags likelySubtags, + const LanguageTag& tag) { + // The language tag is already maximized if the language, script, and region + // subtags are present and no placeholder subtags ("und", "Zzzz", "ZZ") are + // used. + if (likelySubtags == LikelySubtags::Add) { + return !tag.language().equalTo("und") && + (tag.script().present() && !tag.script().equalTo("Zzzz")) && + (tag.region().present() && !tag.region().equalTo("ZZ")); + } + + // The language tag is already minimized if it only contains a language + // subtag whose value is not the placeholder value "und". + return !tag.language().equalTo("und") && tag.script().missing() && + tag.region().missing(); +} + +// Create an ICU locale ID from the given language tag. +static bool CreateLocaleForLikelySubtags(const LanguageTag& tag, + LocaleId& locale) { + MOZ_ASSERT(locale.length() == 0); + + auto appendSubtag = [&locale](const auto& subtag) { + auto span = subtag.span(); + MOZ_ASSERT(!span.empty()); + return locale.append(span.data(), span.size()); + }; + + // Append the language subtag. 
+ if (!appendSubtag(tag.language())) { + return false; + } + + // Append the script subtag if present. + if (tag.script().present()) { + if (!locale.append('_') || !appendSubtag(tag.script())) { + return false; + } + } + + // Append the region subtag if present. + if (tag.region().present()) { + if (!locale.append('_') || !appendSubtag(tag.region())) { + return false; + } + } + + // Zero-terminated for use with ICU. + return locale.append('\0'); +} + +// Assign the language, script, and region subtags from an ICU locale ID. +// +// ICU provides |uloc_getLanguage|, |uloc_getScript|, and |uloc_getCountry| to +// retrieve these subtags, but unfortunately these functions are rather slow, so +// we use our own implementation. +static bool AssignFromLocaleId(JSContext* cx, LocaleId& localeId, + LanguageTag& tag) { + MOZ_ASSERT(localeId.back() == '\0', + "Locale ID should be zero-terminated for ICU"); + + // Replace the ICU locale ID separator. + std::replace(localeId.begin(), localeId.end(), '_', '-'); + + // ICU replaces "und" with the empty string, which means "und" becomes "" and + // "und-Latn" becomes "-Latn". Handle this case separately. + if (localeId[0] == '\0' || localeId[0] == '-') { + static constexpr char und[] = "und"; + constexpr size_t length = std::char_traits::length(und); + + // Insert "und" in front of the locale ID. + if (!localeId.growBy(length)) { + return false; + } + memmove(localeId.begin() + length, localeId.begin(), localeId.length()); + memmove(localeId.begin(), und, length); + } + + mozilla::Span localeSpan(localeId.begin(), localeId.length() - 1); + + // Retrieve the language, script, and region subtags from the locale ID, but + // ignore any other subtags. 
+ LanguageTag localeTag(cx); + if (!LanguageTagParser::parseBaseName(cx, localeSpan, localeTag)) { + return false; + } + + tag.setLanguage(localeTag.language()); + tag.setScript(localeTag.script()); + tag.setRegion(localeTag.region()); + + return true; +} + +template +static bool CallLikelySubtags(JSContext* cx, const LocaleId& localeId, + LocaleId& result) { + // Locale ID must be zero-terminated before passing it to ICU. + MOZ_ASSERT(localeId.back() == '\0'); + MOZ_ASSERT(result.length() == 0); + + // Ensure there's enough room for the result. + MOZ_ALWAYS_TRUE(result.resize(LocaleId::InlineLength)); + + int32_t length = intl::CallICU( + cx, + [&localeId](char* chars, int32_t size, UErrorCode* status) { + return likelySubtagsFn(localeId.begin(), chars, size, status); + }, + result); + if (length < 0) { + return false; + } + + MOZ_ASSERT( + size_t(length) <= LocaleId::InlineLength, + "Unexpected extra subtags were added by ICU. If this assertion ever " + "fails, simply remove it and move on like nothing ever happended."); + + // Resize the vector to the actual string length. + result.shrinkTo(length); + + // Zero-terminated for use with ICU. + return result.append('\0'); +} + +// The canonical way to compute the Unicode BCP 47 locale identifier with likely +// subtags is as follows: +// +// 1. Call uloc_forLanguageTag() to transform the locale identifer into an ICU +// locale ID. +// 2. Call uloc_addLikelySubtags() to add the likely subtags to the locale ID. +// 3. Call uloc_toLanguageTag() to transform the resulting locale ID back into +// a Unicode BCP 47 locale identifier. +// +// Since uloc_forLanguageTag() and uloc_toLanguageTag() are both kind of slow +// and we know, by construction, that the input Unicode BCP 47 locale identifier +// only contains valid language, script, and region subtags, we can avoid both +// calls if we implement them ourselves, see CreateLocaleForLikelySubtags() and +// AssignFromLocaleId(). 
(Where "slow" means about 50% of the execution time of +// |Intl.Locale.prototype.maximize|.) +static bool LikelySubtags(JSContext* cx, LikelySubtags likelySubtags, + LanguageTag& tag) { + // Return early if the input is already maximized/minimized. + if (HasLikelySubtags(likelySubtags, tag)) { + return true; + } + + // Create the locale ID for the input argument. + LocaleId locale(cx); + if (!CreateLocaleForLikelySubtags(tag, locale)) { + return false; + } + + // Either add or remove likely subtags to/from the locale ID. + LocaleId localeLikelySubtags(cx); + if (likelySubtags == LikelySubtags::Add) { + if (!CallLikelySubtags(cx, locale, + localeLikelySubtags)) { + return false; + } + } else { + if (!CallLikelySubtags(cx, locale, + localeLikelySubtags)) { + return false; + } + } + + // Assign the language, script, and region subtags from the locale ID. + if (!AssignFromLocaleId(cx, localeLikelySubtags, tag)) { + return false; + } + + // Update mappings in case ICU returned a non-canonical locale. + return tag.canonicalizeBaseName(cx); +} + +bool LanguageTag::addLikelySubtags(JSContext* cx) { + return LikelySubtags(cx, LikelySubtags::Add, *this); +} + +bool LanguageTag::removeLikelySubtags(JSContext* cx) { + return LikelySubtags(cx, LikelySubtags::Remove, *this); +} + +LanguageTagParser::Token LanguageTagParser::nextToken() { + MOZ_ASSERT(index_ <= length_ + 1, "called after 'None' token was read"); + + TokenKind kind = TokenKind::None; + size_t tokenLength = 0; + for (size_t i = index_; i < length_; i++) { + // UTS 35, section 3.1. 
+ // alpha = [A-Z a-z] ; + // digit = [0-9] ; + char16_t c = charAtUnchecked(i); + if (mozilla::IsAsciiAlpha(c)) { + kind |= TokenKind::Alpha; + } else if (mozilla::IsAsciiDigit(c)) { + kind |= TokenKind::Digit; + } else if (c == '-' && i > index_ && i + 1 < length_) { + break; + } else { + return {TokenKind::Error, 0, 0}; + } + tokenLength += 1; + } + + Token token{kind, index_, tokenLength}; + index_ += tokenLength + 1; + return token; +} + +UniqueChars LanguageTagParser::chars(JSContext* cx, size_t index, + size_t length) const { + // Add +1 to null-terminate the string. + auto chars = cx->make_pod_array(length + 1); + if (chars) { + char* dest = chars.get(); + if (locale_.is()) { + std::copy_n(locale_.as() + index, length, dest); + } else { + std::copy_n(locale_.as() + index, length, dest); + } + dest[length] = '\0'; + } + return chars; +} + +// Parse the `unicode_language_id` production. +// +// unicode_language_id = unicode_language_subtag +// (sep unicode_script_subtag)? +// (sep unicode_region_subtag)? +// (sep unicode_variant_subtag)* ; +// +// sep = "-" +// +// Note: Unicode CLDR locale identifier backward compatibility extensions +// removed from `unicode_language_id`. +// +// |tok| is the current token from |ts|. +// +// All subtags will be added unaltered to |tag|, without canonicalizing their +// case or, in the case of variant subtags, detecting and rejecting duplicate +// variants. Users must subsequently |canonicalizeBaseName| to perform these +// actions. +// +// Do not use this function directly: use |parseBaseName| or +// |parseTlangFromTransformExtension| instead. +JS::Result LanguageTagParser::internalParseBaseName(JSContext* cx, + LanguageTagParser& ts, + LanguageTag& tag, + Token& tok) { + if (ts.isLanguage(tok)) { + ts.copyChars(tok, tag.language_); + + tok = ts.nextToken(); + } else { + // The language subtag is mandatory. 
+ return false; + } + + if (ts.isScript(tok)) { + ts.copyChars(tok, tag.script_); + + tok = ts.nextToken(); + } + + if (ts.isRegion(tok)) { + ts.copyChars(tok, tag.region_); + + tok = ts.nextToken(); + } + + auto& variants = tag.variants_; + MOZ_ASSERT(variants.length() == 0); + while (ts.isVariant(tok)) { + auto variant = ts.chars(cx, tok); + if (!variant) { + return cx->alreadyReportedOOM(); + } + if (!variants.append(std::move(variant))) { + return cx->alreadyReportedOOM(); + } + + tok = ts.nextToken(); + } + + return true; +} + +static mozilla::Variant StringChars( + const char* locale) { + return mozilla::AsVariant(reinterpret_cast(locale)); +} + +static mozilla::Variant StringChars( + JSLinearString* linear, JS::AutoCheckCannotGC& nogc) { + if (linear->hasLatin1Chars()) { + return mozilla::AsVariant(linear->latin1Chars(nogc)); + } + return mozilla::AsVariant(linear->twoByteChars(nogc)); +} + +JS::Result LanguageTagParser::tryParse(JSContext* cx, + JSLinearString* locale, + LanguageTag& tag) { + JS::AutoCheckCannotGC nogc; + LocaleChars localeChars = StringChars(locale, nogc); + return tryParse(cx, localeChars, locale->length(), tag); +} + +JS::Result LanguageTagParser::tryParse(JSContext* cx, + mozilla::Span locale, + LanguageTag& tag) { + LocaleChars localeChars = StringChars(locale.data()); + return tryParse(cx, localeChars, locale.size(), tag); +} + +JS::Result LanguageTagParser::tryParse(JSContext* cx, + LocaleChars& localeChars, + size_t localeLength, + LanguageTag& tag) { + // unicode_locale_id = unicode_language_id + // extensions* + // pu_extensions? ; + + LanguageTagParser ts(localeChars, localeLength); + Token tok = ts.nextToken(); + + bool ok; + MOZ_TRY_VAR(ok, parseBaseName(cx, ts, tag, tok)); + if (!ok) { + return false; + } + + // extensions = unicode_locale_extensions + // | transformed_extensions + // | other_extensions ; + + // Bit set of seen singletons. 
+ uint64_t seenSingletons = 0; + + auto& extensions = tag.extensions_; + while (ts.isExtensionStart(tok)) { + char singleton = ts.singletonKey(tok); + + // Reject the input if a duplicate singleton was found. + uint64_t hash = 1ULL << (mozilla::AsciiAlphanumericToNumber(singleton) + 1); + if (seenSingletons & hash) { + return false; + } + seenSingletons |= hash; + + Token start = tok; + tok = ts.nextToken(); + + // We'll check for missing non-singleton subtags after this block by + // comparing |startValue| with the then-current position. + size_t startValue = tok.index(); + + if (singleton == 'u') { + while (ts.isUnicodeExtensionPart(tok)) { + tok = ts.nextToken(); + } + } else if (singleton == 't') { + // transformed_extensions = sep [tT] + // ((sep tlang (sep tfield)*) + // | (sep tfield)+) ; + + // tlang = unicode_language_subtag + // (sep unicode_script_subtag)? + // (sep unicode_region_subtag)? + // (sep unicode_variant_subtag)* ; + if (ts.isLanguage(tok)) { + tok = ts.nextToken(); + + if (ts.isScript(tok)) { + tok = ts.nextToken(); + } + + if (ts.isRegion(tok)) { + tok = ts.nextToken(); + } + + while (ts.isVariant(tok)) { + tok = ts.nextToken(); + } + } + + // tfield = tkey tvalue; + while (ts.isTransformExtensionKey(tok)) { + tok = ts.nextToken(); + + size_t startTValue = tok.index(); + while (ts.isTransformExtensionPart(tok)) { + tok = ts.nextToken(); + } + + // `tfield` requires at least one `tvalue`. + if (tok.index() <= startTValue) { + return false; + } + } + } else { + while (ts.isOtherExtensionPart(tok)) { + tok = ts.nextToken(); + } + } + + // Singletons must be followed by a non-singleton subtag, "en-a-b" is not + // allowed. 
+ if (tok.index() <= startValue) { + return false; + } + + UniqueChars extension = ts.extension(cx, start, tok); + if (!extension) { + return cx->alreadyReportedOOM(); + } + if (!extensions.append(std::move(extension))) { + return cx->alreadyReportedOOM(); + } + } + + // Trailing `pu_extension` component of the `unicode_locale_id` production. + if (ts.isPrivateUseStart(tok)) { + Token start = tok; + tok = ts.nextToken(); + + size_t startValue = tok.index(); + while (ts.isPrivateUsePart(tok)) { + tok = ts.nextToken(); + } + + // There must be at least one subtag after the "-x-". + if (tok.index() <= startValue) { + return false; + } + + UniqueChars privateUse = ts.extension(cx, start, tok); + if (!privateUse) { + return cx->alreadyReportedOOM(); + } + tag.privateuse_ = std::move(privateUse); + } + + // Return true if the complete input was successfully parsed. + return tok.isNone(); +} + +bool LanguageTagParser::parse(JSContext* cx, JSLinearString* locale, + LanguageTag& tag) { + bool ok; + JS_TRY_VAR_OR_RETURN_FALSE(cx, ok, tryParse(cx, locale, tag)); + if (ok) { + return true; + } + if (UniqueChars localeChars = QuoteString(cx, locale, '"')) { JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_INVALID_LANGUAGE_TAG, localeChars.get()); } return false; } +bool LanguageTagParser::parse(JSContext* cx, mozilla::Span locale, + LanguageTag& tag) { + bool ok; + JS_TRY_VAR_OR_RETURN_FALSE(cx, ok, tryParse(cx, locale, tag)); + if (ok) { + return true; + } + if (UniqueChars localeChars = + DuplicateString(cx, locale.data(), locale.size())) { + JS_ReportErrorNumberUTF8(cx, GetErrorMessage, nullptr, + JSMSG_INVALID_LANGUAGE_TAG, localeChars.get()); + } + return false; +} + +bool LanguageTagParser::parseBaseName(JSContext* cx, + mozilla::Span locale, + LanguageTag& tag) { + LocaleChars localeChars = StringChars(locale.data()); + LanguageTagParser ts(localeChars, locale.size()); + Token tok = ts.nextToken(); + + // Parse only the base-name part and ignore any trailing 
characters. + bool ok; + JS_TRY_VAR_OR_RETURN_FALSE(cx, ok, parseBaseName(cx, ts, tag, tok)); + if (ok) { + return true; + } + if (UniqueChars localeChars = + DuplicateString(cx, locale.data(), locale.size())) { + JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, + JSMSG_INVALID_LANGUAGE_TAG, localeChars.get()); + } + return false; +} + +JS::Result LanguageTagParser::tryParseBaseName(JSContext* cx, + JSLinearString* locale, + LanguageTag& tag) { + JS::AutoCheckCannotGC nogc; + LocaleChars localeChars = StringChars(locale, nogc); + LanguageTagParser ts(localeChars, locale->length()); + Token tok = ts.nextToken(); + + // Return true if the complete input was successfully parsed. + bool ok; + MOZ_TRY_VAR(ok, parseBaseName(cx, ts, tag, tok)); + return ok && tok.isNone(); +} + +// Parse |extension|, which must be a valid `transformed_extensions` subtag, and +// fill |tag| and |fields| from the `tlang` and `tfield` components. +JS::Result LanguageTagParser::parseTransformExtension( + JSContext* cx, mozilla::Span extension, LanguageTag& tag, + TFieldVector& fields) { + LocaleChars extensionChars = StringChars(extension.data()); + LanguageTagParser ts(extensionChars, extension.size()); + Token tok = ts.nextToken(); + + if (!ts.isExtensionStart(tok) || ts.singletonKey(tok) != 't') { + return false; + } + + tok = ts.nextToken(); + + if (tok.isNone()) { + return false; + } + + if (ts.isLanguage(tok)) { + // We're parsing a possible `tlang` in a known-valid transform extension, so + // use the special-purpose function that takes advantage of this to compute + // lowercased |tag| contents in an optimal manner. + MOZ_TRY(parseTlangInTransformExtension(cx, ts, tag, tok)); + + // After `tlang` we must have a `tfield` and its `tkey`, or we're at the end + // of the transform extension. + MOZ_ASSERT(ts.isTransformExtensionKey(tok) || tok.isNone()); + } else { + // If there's no `tlang` subtag, at least one `tfield` must be present. 
+ MOZ_ASSERT(ts.isTransformExtensionKey(tok)); + } + + // Trailing `tfield` subtags. (Any other trailing subtags are an error, + // because we're guaranteed to only see a valid transform extension here.) + while (ts.isTransformExtensionKey(tok)) { + size_t begin = tok.index(); + tok = ts.nextToken(); + + size_t startTValue = tok.index(); + while (ts.isTransformExtensionPart(tok)) { + tok = ts.nextToken(); + } + + // `tfield` requires at least one `tvalue`. + if (tok.index() <= startTValue) { + return false; + } + + size_t length = tok.index() - 1 - begin; + if (!fields.emplaceBack(begin, length)) { + return cx->alreadyReportedOOM(); + } + } + + // Return true if the complete input was successfully parsed. + return tok.isNone(); +} + +// Parse |extension|, which must be a valid `unicode_locale_extensions` subtag, +// and fill |attributes| and |keywords| from the `attribute` and `keyword` +// components. +JS::Result LanguageTagParser::parseUnicodeExtension( + JSContext* cx, mozilla::Span extension, + AttributesVector& attributes, KeywordsVector& keywords) { + LocaleChars extensionChars = StringChars(extension.data()); + LanguageTagParser ts(extensionChars, extension.size()); + Token tok = ts.nextToken(); + + // unicode_locale_extensions = sep [uU] ((sep keyword)+ | + // (sep attribute)+ (sep keyword)*) ; + + if (!ts.isExtensionStart(tok) || ts.singletonKey(tok) != 'u') { + return false; + } + + tok = ts.nextToken(); + + if (tok.isNone()) { + return false; + } + + while (ts.isUnicodeExtensionAttribute(tok)) { + if (!attributes.emplaceBack(tok.index(), tok.length())) { + return cx->alreadyReportedOOM(); + } + + tok = ts.nextToken(); + } + + // keyword = key (sep type)?
; + while (ts.isUnicodeExtensionKey(tok)) { + size_t begin = tok.index(); + tok = ts.nextToken(); + + while (ts.isUnicodeExtensionType(tok)) { + tok = ts.nextToken(); + } + + if (tok.isError()) { + return false; + } + + size_t length = tok.index() - 1 - begin; + if (!keywords.emplaceBack(begin, length)) { + return cx->alreadyReportedOOM(); + } + } + + // Return true if the complete input was successfully parsed. + return tok.isNone(); +} + +bool LanguageTagParser::canParseUnicodeExtension( + mozilla::Span extension) { + LocaleChars extensionChars = StringChars(extension.data()); + LanguageTagParser ts(extensionChars, extension.size()); + Token tok = ts.nextToken(); + + // unicode_locale_extensions = sep [uU] ((sep keyword)+ | + // (sep attribute)+ (sep keyword)*) ; + + if (!ts.isExtensionStart(tok) || ts.singletonKey(tok) != 'u') { + return false; + } + + tok = ts.nextToken(); + + if (tok.isNone()) { + return false; + } + + while (ts.isUnicodeExtensionAttribute(tok)) { + tok = ts.nextToken(); + } + + // keyword = key (sep type)? ; + while (ts.isUnicodeExtensionKey(tok)) { + tok = ts.nextToken(); + + while (ts.isUnicodeExtensionType(tok)) { + tok = ts.nextToken(); + } + + if (tok.isError()) { + return false; + } + } + + // Return true if the complete input was successfully parsed. + return tok.isNone(); +} + +bool LanguageTagParser::canParseUnicodeExtensionType( + JSLinearString* unicodeType) { + MOZ_ASSERT(unicodeType->length() > 0, "caller must exclude empty strings"); + + JS::AutoCheckCannotGC nogc; + LocaleChars unicodeTypeChars = StringChars(unicodeType, nogc); + + LanguageTagParser ts(unicodeTypeChars, unicodeType->length()); + Token tok = ts.nextToken(); + + while (ts.isUnicodeExtensionType(tok)) { + tok = ts.nextToken(); + } + + // Return true if the complete input was successfully parsed. 
+ return tok.isNone(); +} + bool ParseStandaloneLanguageTag(HandleLinearString str, - mozilla::intl::LanguageSubtag& result) { - // Tell the analysis the |IsStructurallyValidLanguageTag| function can't GC. - JS::AutoSuppressGCAnalysis nogc; - + LanguageSubtag& result) { + JS::AutoCheckCannotGC nogc; if (str->hasLatin1Chars()) { - if (!mozilla::intl::IsStructurallyValidLanguageTag( - str->latin1Range(nogc))) { + if (!IsStructurallyValidLanguageTag(str->latin1Range(nogc))) { return false; } result.set(str->latin1Range(nogc)); } else { - if (!mozilla::intl::IsStructurallyValidLanguageTag( - str->twoByteRange(nogc))) { + if (!IsStructurallyValidLanguageTag(str->twoByteRange(nogc))) { return false; } result.set(str->twoByteRange(nogc)); @@ -59,20 +1594,15 @@ bool ParseStandaloneLanguageTag(HandleLinearString str, return true; } -bool ParseStandaloneScriptTag(HandleLinearString str, - mozilla::intl::ScriptSubtag& result) { - // Tell the analysis the |IsStructurallyValidScriptTag| function can't GC. - JS::AutoSuppressGCAnalysis nogc; - +bool ParseStandaloneScriptTag(HandleLinearString str, ScriptSubtag& result) { + JS::AutoCheckCannotGC nogc; if (str->hasLatin1Chars()) { - if (!mozilla::intl::IsStructurallyValidScriptTag( - str->latin1Range(nogc))) { + if (!IsStructurallyValidScriptTag(str->latin1Range(nogc))) { return false; } result.set(str->latin1Range(nogc)); } else { - if (!mozilla::intl::IsStructurallyValidScriptTag( - str->twoByteRange(nogc))) { + if (!IsStructurallyValidScriptTag(str->twoByteRange(nogc))) { return false; } result.set(str->twoByteRange(nogc)); @@ -80,20 +1610,15 @@ bool ParseStandaloneScriptTag(HandleLinearString str, return true; } -bool ParseStandaloneRegionTag(HandleLinearString str, - mozilla::intl::RegionSubtag& result) { - // Tell the analysis the |IsStructurallyValidRegionTag| function can't GC. 
- JS::AutoSuppressGCAnalysis nogc; - +bool ParseStandaloneRegionTag(HandleLinearString str, RegionSubtag& result) { + JS::AutoCheckCannotGC nogc; if (str->hasLatin1Chars()) { - if (!mozilla::intl::IsStructurallyValidRegionTag( - str->latin1Range(nogc))) { + if (!IsStructurallyValidRegionTag(str->latin1Range(nogc))) { return false; } result.set(str->latin1Range(nogc)); } else { - if (!mozilla::intl::IsStructurallyValidRegionTag( - str->twoByteRange(nogc))) { + if (!IsStructurallyValidRegionTag(str->twoByteRange(nogc))) { return false; } result.set(str->twoByteRange(nogc)); @@ -154,7 +1679,7 @@ JS::Result ParseStandaloneISO639LanguageTag(JSContext* cx, } } - mozilla::intl::LanguageSubtag languageTag; + LanguageSubtag languageTag; if (str->hasLatin1Chars()) { JS::AutoCheckCannotGC nogc; languageTag.set(str->latin1Range(nogc)); @@ -170,13 +1695,13 @@ JS::Result ParseStandaloneISO639LanguageTag(JSContext* cx, // Reject the input if the canonical tag contains more than just a single // language subtag. - if (mozilla::intl::Locale::complexLanguageMapping(languageTag)) { + if (LanguageTag::complexLanguageMapping(languageTag)) { return nullptr; } // Take care to replace deprecated subtags with their preferred values. 
JSString* result; - if (mozilla::intl::Locale::languageMapping(languageTag) || !isLowerCase) { + if (LanguageTag::languageMapping(languageTag) || !isLowerCase) { result = NewStringCopy(cx, languageTag.span()); } else { result = str; diff --git a/js/src/builtin/intl/LanguageTag.h b/js/src/builtin/intl/LanguageTag.h index e7a91168a31c..3abfaf1cc573 100644 --- a/js/src/builtin/intl/LanguageTag.h +++ b/js/src/builtin/intl/LanguageTag.h @@ -9,8 +9,17 @@ #ifndef builtin_intl_LanguageTag_h #define builtin_intl_LanguageTag_h -#include "mozilla/intl/Locale.h" +#include "mozilla/Assertions.h" #include "mozilla/Span.h" +#include "mozilla/TextUtils.h" +#include "mozilla/TypedEnumBits.h" +#include "mozilla/Variant.h" + +#include +#include +#include +#include +#include #include "js/AllocPolicy.h" #include "js/GCAPI.h" @@ -29,32 +38,702 @@ namespace js { namespace intl { /** - * Parse a string Unicode BCP 47 locale identifier. If successful, store in - * |result| and return true. Otherwise return false. + * Return true if |language| is a valid language subtag. */ -[[nodiscard]] bool ParseLocale(JSContext* cx, JS::Handle str, - mozilla::intl::Locale& result); +template +bool IsStructurallyValidLanguageTag(mozilla::Span language); + +/** + * Return true if |script| is a valid script subtag. + */ +template +bool IsStructurallyValidScriptTag(mozilla::Span script); + +/** + * Return true if |region| is a valid region subtag. + */ +template +bool IsStructurallyValidRegionTag(mozilla::Span region); + +#ifdef DEBUG +/** + * Return true if |variant| is a valid variant subtag. + */ +bool IsStructurallyValidVariantTag(mozilla::Span variant); + +/** + * Return true if |extension| is a valid Unicode extension subtag. + */ +bool IsStructurallyValidUnicodeExtensionTag( + mozilla::Span extension); + +/** + * Return true if |privateUse| is a valid private-use subtag. 
+ */ +bool IsStructurallyValidPrivateUseTag(mozilla::Span privateUse); + +#endif + +template +char AsciiToLowerCase(CharT c) { + MOZ_ASSERT(mozilla::IsAscii(c)); + return mozilla::IsAsciiUppercaseAlpha(c) ? (c + 0x20) : c; +} + +template +char AsciiToUpperCase(CharT c) { + MOZ_ASSERT(mozilla::IsAscii(c)); + return mozilla::IsAsciiLowercaseAlpha(c) ? (c - 0x20) : c; +} + +template +void AsciiToLowerCase(CharT* chars, size_t length, char* dest) { + // Tell the analysis the |std::transform| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + char (&fn)(CharT) = AsciiToLowerCase; + std::transform(chars, chars + length, dest, fn); +} + +template +void AsciiToUpperCase(CharT* chars, size_t length, char* dest) { + // Tell the analysis the |std::transform| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + char (&fn)(CharT) = AsciiToUpperCase; + std::transform(chars, chars + length, dest, fn); +} + +template +void AsciiToTitleCase(CharT* chars, size_t length, char* dest) { + if (length > 0) { + AsciiToUpperCase(chars, 1, dest); + AsciiToLowerCase(chars + 1, length - 1, dest + 1); + } +} + +// Constants for language subtag lengths. +namespace LanguageTagLimits { + +// unicode_language_subtag = alpha{2,3} | alpha{5,8} ; +static constexpr size_t LanguageLength = 8; + +// unicode_script_subtag = alpha{4} ; +static constexpr size_t ScriptLength = 4; + +// unicode_region_subtag = (alpha{2} | digit{3}) ; +static constexpr size_t RegionLength = 3; +static constexpr size_t AlphaRegionLength = 2; +static constexpr size_t DigitRegionLength = 3; + +// key = alphanum alpha ; +static constexpr size_t UnicodeKeyLength = 2; + +// tkey = alpha digit ; +static constexpr size_t TransformKeyLength = 2; + +} // namespace LanguageTagLimits + +// Fixed size language subtag which is stored inline in LanguageTag. 
+template +class LanguageTagSubtag final { + uint8_t length_ = 0; + char chars_[Length] = {}; // zero initialize + + public: + LanguageTagSubtag() = default; + + LanguageTagSubtag(const LanguageTagSubtag&) = delete; + LanguageTagSubtag& operator=(const LanguageTagSubtag&) = delete; + + size_t length() const { return length_; } + bool missing() const { return length_ == 0; } + bool present() const { return length_ > 0; } + + mozilla::Span span() const { return {chars_, length_}; } + + template + void set(mozilla::Span str) { + MOZ_ASSERT(str.size() <= Length); + std::copy_n(str.data(), str.size(), chars_); + length_ = str.size(); + } + + // The toXYZCase() methods are using |Length| instead of |length()|, because + // current compilers (tested GCC and Clang) can't infer the maximum string + // length - even when using hints like |std::min| - and instead are emitting + // SIMD optimized code. Using a fixed sized length avoids emitting the SIMD + // code. (Emitting SIMD code doesn't make sense here, because the SIMD code + // only kicks in for long strings.) A fixed length will additionally ensure + // the compiler unrolls the loop in the case conversion code. + + void toLowerCase() { AsciiToLowerCase(chars_, Length, chars_); } + + void toUpperCase() { AsciiToUpperCase(chars_, Length, chars_); } + + void toTitleCase() { AsciiToTitleCase(chars_, Length, chars_); } + + template + bool equalTo(const char (&str)[N]) const { + static_assert(N - 1 <= Length, + "subtag literals must not exceed the maximum subtag length"); + + return length_ == N - 1 && memcmp(chars_, str, N - 1) == 0; + } +}; + +using LanguageSubtag = LanguageTagSubtag; +using ScriptSubtag = LanguageTagSubtag; +using RegionSubtag = LanguageTagSubtag; + +/** + * Object representing a language tag. + * + * All subtags are already in canonicalized case. 
+ */ +class MOZ_STACK_CLASS LanguageTag final { + LanguageSubtag language_ = {}; + ScriptSubtag script_ = {}; + RegionSubtag region_ = {}; + + using VariantsVector = Vector; + using ExtensionsVector = Vector; + + VariantsVector variants_; + ExtensionsVector extensions_; + JS::UniqueChars privateuse_ = nullptr; + + friend class LanguageTagParser; + + bool canonicalizeUnicodeExtension(JSContext* cx, + JS::UniqueChars& unicodeExtension); + + bool canonicalizeTransformExtension(JSContext* cx, + JS::UniqueChars& transformExtension); + + public: + static bool languageMapping(LanguageSubtag& language); + static bool complexLanguageMapping(const LanguageSubtag& language); + + private: + static bool scriptMapping(ScriptSubtag& script); + static bool regionMapping(RegionSubtag& region); + static bool complexRegionMapping(const RegionSubtag& region); + + void performComplexLanguageMappings(); + void performComplexRegionMappings(); + [[nodiscard]] bool performVariantMappings(JSContext* cx); + + [[nodiscard]] bool updateLegacyMappings(JSContext* cx); + + static bool signLanguageMapping(LanguageSubtag& language, + const RegionSubtag& region); + + static const char* replaceTransformExtensionType( + mozilla::Span key, mozilla::Span type); + + public: + /** + * Given a Unicode key and type, return the null-terminated preferred + * replacement for that type if there is one, or null if there is none, e.g. + * in effect + * |replaceUnicodeExtensionType("ca", "islamicc") == "islamic-civil"| + * and + * |replaceUnicodeExtensionType("ca", "islamic-civil") == nullptr|. 
+ */ + static const char* replaceUnicodeExtensionType( + mozilla::Span key, mozilla::Span type); + + public: + explicit LanguageTag(JSContext* cx) : variants_(cx), extensions_(cx) {} + + LanguageTag(const LanguageTag&) = delete; + LanguageTag& operator=(const LanguageTag&) = delete; + + const LanguageSubtag& language() const { return language_; } + const ScriptSubtag& script() const { return script_; } + const RegionSubtag& region() const { return region_; } + const auto& variants() const { return variants_; } + const auto& extensions() const { return extensions_; } + const char* privateuse() const { return privateuse_.get(); } + + /** + * Return the Unicode extension subtag or nullptr if not present. + */ + const char* unicodeExtension() const; + + private: + ptrdiff_t unicodeExtensionIndex() const; + + public: + /** + * Set the language subtag. The input must be a valid language subtag. + */ + template + void setLanguage(const char (&language)[N]) { + mozilla::Span span(language, N - 1); + MOZ_ASSERT(IsStructurallyValidLanguageTag(span)); + language_.set(span); + } + + /** + * Set the language subtag. The input must be a valid language subtag. + */ + void setLanguage(const LanguageSubtag& language) { + MOZ_ASSERT(IsStructurallyValidLanguageTag(language.span())); + language_.set(language.span()); + } + + /** + * Set the script subtag. The input must be a valid script subtag. + */ + template + void setScript(const char (&script)[N]) { + mozilla::Span span(script, N - 1); + MOZ_ASSERT(IsStructurallyValidScriptTag(span)); + script_.set(span); + } + + /** + * Set the script subtag. The input must be a valid script subtag or the empty + * string. + */ + void setScript(const ScriptSubtag& script) { + MOZ_ASSERT(script.missing() || IsStructurallyValidScriptTag(script.span())); + script_.set(script.span()); + } + + /** + * Set the region subtag. The input must be a valid region subtag. 
+ */ + template + void setRegion(const char (&region)[N]) { + mozilla::Span span(region, N - 1); + MOZ_ASSERT(IsStructurallyValidRegionTag(span)); + region_.set(span); + } + + /** + * Set the region subtag. The input must be a valid region subtag or the empty + * string. + */ + void setRegion(const RegionSubtag& region) { + MOZ_ASSERT(region.missing() || IsStructurallyValidRegionTag(region.span())); + region_.set(region.span()); + } + + /** + * Removes all variant subtags. + */ + void clearVariants() { variants_.clearAndFree(); } + + /** + * Set the Unicode extension subtag. The input must be a valid Unicode + * extension subtag. + */ + bool setUnicodeExtension(JS::UniqueChars extension); + + /** + * Remove any Unicode extension subtag if present. + */ + void clearUnicodeExtension(); + + /** + * Set the private-use subtag. The input must be a valid private-use subtag + * or nullptr. + */ + void setPrivateuse(JS::UniqueChars privateuse) { + MOZ_ASSERT(!privateuse || + IsStructurallyValidPrivateUseTag( + {privateuse.get(), strlen(privateuse.get())})); + privateuse_ = std::move(privateuse); + } + + /** Canonicalize the base-name (language, script, region, variant) subtags. */ + bool canonicalizeBaseName(JSContext* cx); + + /** + * Canonicalize all extension subtags. + */ + bool canonicalizeExtensions(JSContext* cx); + + /** + * Canonicalizes the given structurally valid Unicode BCP 47 locale + * identifier, including regularized case of subtags. For example, the + * language tag Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE, + * where + * + * Zh ; 2*3ALPHA + * -haNS ; ["-" script] + * -bu ; ["-" region] + * -variant2 ; *("-" variant) + * -Variant1 + * -u-ca-chinese ; *("-" extension) + * -t-Zh-laTN + * -x-PRIVATE ; ["-" privateuse] + * + * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private + * + * Spec: ECMAScript Internationalization API Specification, 6.2.3.
+ */ + bool canonicalize(JSContext* cx) { + return canonicalizeBaseName(cx) && canonicalizeExtensions(cx); + } + + /** + * Return the string representation of this language tag. + */ + JSString* toString(JSContext* cx) const; + + /** + * Return the string representation of this language tag as a null-terminated + * C-string. + */ + JS::UniqueChars toStringZ(JSContext* cx) const; + + /** + * Add likely-subtags to the language tag. + * + * Spec: + */ + bool addLikelySubtags(JSContext* cx); + + /** + * Remove likely-subtags from the language tag. + * + * Spec: + */ + bool removeLikelySubtags(JSContext* cx); +}; + +/** + * Parser for Unicode BCP 47 locale identifiers. + * + * + */ +class MOZ_STACK_CLASS LanguageTagParser final { + public: + // Exposed as |public| for |MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS|. + enum class TokenKind : uint8_t { + None = 0b000, + Alpha = 0b001, + Digit = 0b010, + AlphaDigit = 0b011, + Error = 0b100 + }; + + private: + class Token final { + size_t index_; + size_t length_; + TokenKind kind_; + + public: + Token(TokenKind kind, size_t index, size_t length) + : index_(index), length_(length), kind_(kind) {} + + TokenKind kind() const { return kind_; } + size_t index() const { return index_; } + size_t length() const { return length_; } + + bool isError() const { return kind_ == TokenKind::Error; } + bool isNone() const { return kind_ == TokenKind::None; } + bool isAlpha() const { return kind_ == TokenKind::Alpha; } + bool isDigit() const { return kind_ == TokenKind::Digit; } + bool isAlphaDigit() const { return kind_ == TokenKind::AlphaDigit; } + }; + + using LocaleChars = mozilla::Variant; + + const LocaleChars& locale_; + size_t length_; + size_t index_ = 0; + + LanguageTagParser(const LocaleChars& locale, size_t length) + : locale_(locale), length_(length) {} + + char16_t charAtUnchecked(size_t index) const { + if (locale_.is()) { + return locale_.as()[index]; + } + return locale_.as()[index]; + } + + char charAt(size_t index) const { + 
char16_t c = charAtUnchecked(index); + MOZ_ASSERT(mozilla::IsAscii(c)); + return c; + } + + // Copy the token characters into |subtag|. + template + void copyChars(const Token& tok, LanguageTagSubtag& subtag) const { + size_t index = tok.index(); + size_t length = tok.length(); + if (locale_.is()) { + using T = const JS::Latin1Char; + subtag.set(mozilla::Span(locale_.as() + index, length)); + } else { + using T = const char16_t; + subtag.set(mozilla::Span(locale_.as() + index, length)); + } + } + + // Create a string copy of |length| characters starting at |index|. + JS::UniqueChars chars(JSContext* cx, size_t index, size_t length) const; + + // Create a string copy of the token characters. + JS::UniqueChars chars(JSContext* cx, const Token& tok) const { + return chars(cx, tok.index(), tok.length()); + } + + JS::UniqueChars extension(JSContext* cx, const Token& start, + const Token& end) const { + MOZ_ASSERT(start.index() < end.index()); + + size_t length = end.index() - 1 - start.index(); + return chars(cx, start.index(), length); + } + + Token nextToken(); + + // unicode_language_subtag = alpha{2,3} | alpha{5,8} ; + // + // Four character language subtags are not allowed in Unicode BCP 47 locale + // identifiers. Also see the comparison to Unicode CLDR locale identifiers in + // . 
+ bool isLanguage(const Token& tok) const { + return tok.isAlpha() && ((2 <= tok.length() && tok.length() <= 3) || + (5 <= tok.length() && tok.length() <= 8)); + } + + // unicode_script_subtag = alpha{4} ; + bool isScript(const Token& tok) const { + return tok.isAlpha() && tok.length() == 4; + } + + // unicode_region_subtag = (alpha{2} | digit{3}) ; + bool isRegion(const Token& tok) const { + return (tok.isAlpha() && tok.length() == 2) || + (tok.isDigit() && tok.length() == 3); + } + + // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ; + bool isVariant(const Token& tok) const { + return (5 <= tok.length() && tok.length() <= 8) || + (tok.length() == 4 && mozilla::IsAsciiDigit(charAt(tok.index()))); + } + + // Returns the code unit of the first character at the given singleton token. + // Always returns the lower case form of an alphabetical character. + char singletonKey(const Token& tok) const { + MOZ_ASSERT(tok.length() == 1); + return AsciiToLowerCase(charAt(tok.index())); + } + + // extensions = unicode_locale_extensions | + // transformed_extensions | + // other_extensions ; + // + // unicode_locale_extensions = sep [uU] ((sep keyword)+ | + // (sep attribute)+ (sep keyword)*) ; + // + // transformed_extensions = sep [tT] ((sep tlang (sep tfield)*) | + // (sep tfield)+) ; + // + // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; + bool isExtensionStart(const Token& tok) const { + return tok.length() == 1 && singletonKey(tok) != 'x'; + } + + // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; + bool isOtherExtensionPart(const Token& tok) const { + return 2 <= tok.length() && tok.length() <= 8; + } + + // unicode_locale_extensions = sep [uU] ((sep keyword)+ | + // (sep attribute)+ (sep keyword)*) ; + // keyword = key (sep type)? 
; + bool isUnicodeExtensionPart(const Token& tok) const { + return isUnicodeExtensionKey(tok) || isUnicodeExtensionType(tok) || + isUnicodeExtensionAttribute(tok); + } + + // attribute = alphanum{3,8} ; + bool isUnicodeExtensionAttribute(const Token& tok) const { + return 3 <= tok.length() && tok.length() <= 8; + } + + // key = alphanum alpha ; + bool isUnicodeExtensionKey(const Token& tok) const { + return tok.length() == 2 && mozilla::IsAsciiAlpha(charAt(tok.index() + 1)); + } + + // type = alphanum{3,8} (sep alphanum{3,8})* ; + bool isUnicodeExtensionType(const Token& tok) const { + return 3 <= tok.length() && tok.length() <= 8; + } + + // tkey = alpha digit ; + bool isTransformExtensionKey(const Token& tok) const { + return tok.length() == 2 && mozilla::IsAsciiAlpha(charAt(tok.index())) && + mozilla::IsAsciiDigit(charAt(tok.index() + 1)); + } + + // tvalue = (sep alphanum{3,8})+ ; + bool isTransformExtensionPart(const Token& tok) const { + return 3 <= tok.length() && tok.length() <= 8; + } + + // pu_extensions = sep [xX] (sep alphanum{1,8})+ ; + bool isPrivateUseStart(const Token& tok) const { + return tok.length() == 1 && singletonKey(tok) == 'x'; + } + + // pu_extensions = sep [xX] (sep alphanum{1,8})+ ; + bool isPrivateUsePart(const Token& tok) const { + return 1 <= tok.length() && tok.length() <= 8; + } + + // Helper function for use in |parseBaseName| and + // |parseTlangInTransformExtension|. Do not use this directly! + static JS::Result internalParseBaseName(JSContext* cx, + LanguageTagParser& ts, + LanguageTag& tag, Token& tok); + + // Parse the `unicode_language_id` production, i.e. the + // language/script/region/variants portion of a language tag, into |tag|. + // |tok| must be the current token. + static JS::Result parseBaseName(JSContext* cx, LanguageTagParser& ts, + LanguageTag& tag, Token& tok) { + return internalParseBaseName(cx, ts, tag, tok); + } + + // Parse the `tlang` production within a parsed 't' transform extension. 
+ // The precise requirements for "previously parsed" are: + // + // * the input begins from current token |tok| with a valid `tlang` + // * the `tlang` is wholly lowercase (*not* canonical case) + // * variant subtags in the `tlang` may contain duplicates and be + // unordered + // + // Return an error on internal failure. Otherwise, return a success value. If + // there was no `tlang`, then |tag.language().missing()|. But if there was a + // `tlang`, then |tag| is filled with subtags exactly as they appeared in the + // parse input. + static JS::Result parseTlangInTransformExtension( + JSContext* cx, LanguageTagParser& ts, LanguageTag& tag, Token& tok) { + MOZ_ASSERT(ts.isLanguage(tok)); + return internalParseBaseName(cx, ts, tag, tok).map([](bool parsed) { + MOZ_ASSERT(parsed); + return JS::Ok(); + }); + } + + friend class LanguageTag; + + class Range final { + size_t begin_; + size_t length_; + + public: + Range(size_t begin, size_t length) : begin_(begin), length_(length) {} + + template + T* begin(T* ptr) const { + return ptr + begin_; + } + + size_t length() const { return length_; } + }; + + using TFieldVector = js::Vector; + using AttributesVector = js::Vector; + using KeywordsVector = js::Vector; + + // Parse |extension|, which must be a validated, fully lowercase + // `transformed_extensions` subtag, and fill |tag| and |fields| from the + // `tlang` and `tfield` components. Data in |tag| is lowercase, consistent + // with |extension|. + static JS::Result parseTransformExtension( + JSContext* cx, mozilla::Span extension, LanguageTag& tag, + TFieldVector& fields); + + // Parse |extension|, which must be a validated, fully lowercase + // `unicode_locale_extensions` subtag, and fill |attributes| and |keywords| + // from the `attribute` and `keyword` components. 
+ static JS::Result parseUnicodeExtension( + JSContext* cx, mozilla::Span extension, + AttributesVector& attributes, KeywordsVector& keywords); + + static JS::Result tryParse(JSContext* cx, LocaleChars& localeChars, + size_t localeLength, LanguageTag& tag); + + public: + // Parse the input string as a language tag. Reports an error to the context + // if the input can't be parsed completely. + static bool parse(JSContext* cx, JSLinearString* locale, LanguageTag& tag); + + // Parse the input string as a language tag. Reports an error to the context + // if the input can't be parsed completely. + static bool parse(JSContext* cx, mozilla::Span locale, + LanguageTag& tag); + + // Parse the input string as a language tag. Returns Ok(true) if the input + // could be completely parsed, Ok(false) if the input couldn't be parsed, + // or Err() in case of internal error. + static JS::Result tryParse(JSContext* cx, JSLinearString* locale, + LanguageTag& tag); + + // Parse the input string as a language tag. Returns Ok(true) if the input + // could be completely parsed, Ok(false) if the input couldn't be parsed, + // or Err() in case of internal error. + static JS::Result tryParse(JSContext* cx, + mozilla::Span locale, + LanguageTag& tag); + + // Parse the input string as the base-name parts (language, script, region, + // variants) of a language tag. Ignores any trailing characters. + static bool parseBaseName(JSContext* cx, mozilla::Span locale, + LanguageTag& tag); + + // Parse the input string as the base-name parts (language, script, region, + // variants) of a language tag. Returns Ok(true) if the input could be + // completely parsed, Ok(false) if the input couldn't be parsed, or Err() in + // case of internal error. + static JS::Result tryParseBaseName(JSContext* cx, + JSLinearString* locale, + LanguageTag& tag); + + // Return true iff |extension| can be parsed as a Unicode extension subtag. 
+ static bool canParseUnicodeExtension(mozilla::Span extension); + + // Return true iff |unicodeType| can be parsed as a Unicode extension type. + static bool canParseUnicodeExtensionType(JSLinearString* unicodeType); +}; + +MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(LanguageTagParser::TokenKind) /** * Parse a string as a standalone |language| tag. If |str| is a standalone * language tag, store it in |result| and return true. Otherwise return false. */ -[[nodiscard]] bool ParseStandaloneLanguageTag( - JS::Handle str, mozilla::intl::LanguageSubtag& result); +[[nodiscard]] bool ParseStandaloneLanguageTag(JS::Handle str, + LanguageSubtag& result); /** * Parse a string as a standalone |script| tag. If |str| is a standalone script * tag, store it in |result| and return true. Otherwise return false. */ -[[nodiscard]] bool ParseStandaloneScriptTag( - JS::Handle str, mozilla::intl::ScriptSubtag& result); +[[nodiscard]] bool ParseStandaloneScriptTag(JS::Handle str, + ScriptSubtag& result); /** * Parse a string as a standalone |region| tag. If |str| is a standalone region * tag, store it in |result| and return true. Otherwise return false. */ -[[nodiscard]] bool ParseStandaloneRegionTag( - JS::Handle str, mozilla::intl::RegionSubtag& result); +[[nodiscard]] bool ParseStandaloneRegionTag(JS::Handle str, + RegionSubtag& result); /** * Parse a string as an ISO-639 language code. 
Return |nullptr| in the result if @@ -65,15 +744,13 @@ JS::Result ParseStandaloneISO639LanguageTag( JSContext* cx, JS::Handle str); class UnicodeExtensionKeyword final { - char key_[mozilla::intl::LanguageTagLimits::UnicodeKeyLength]; + char key_[LanguageTagLimits::UnicodeKeyLength]; JSLinearString* type_; public: - using UnicodeKey = - const char (&)[mozilla::intl::LanguageTagLimits::UnicodeKeyLength + 1]; + using UnicodeKey = const char (&)[LanguageTagLimits::UnicodeKeyLength + 1]; using UnicodeKeySpan = - mozilla::Span; + mozilla::Span; UnicodeExtensionKeyword(UnicodeKey key, JSLinearString* type) : key_{key[0], key[1]}, type_(type) {} @@ -85,7 +762,7 @@ class UnicodeExtensionKeyword final { }; [[nodiscard]] extern bool ApplyUnicodeExtensionToTag( - JSContext* cx, mozilla::intl::Locale& tag, + JSContext* cx, LanguageTag& tag, JS::HandleVector keywords); } // namespace intl diff --git a/intl/components/src/LocaleGenerated.cpp b/js/src/builtin/intl/LanguageTagGenerated.cpp similarity index 84% rename from intl/components/src/LocaleGenerated.cpp rename to js/src/builtin/intl/LanguageTagGenerated.cpp index fd7627a8c6ca..3aa3a0f8b5b2 100644 --- a/intl/components/src/LocaleGenerated.cpp +++ b/js/src/builtin/intl/LanguageTagGenerated.cpp @@ -13,36 +13,39 @@ #include #include -#include "mozilla/intl/Locale.h" +#include "builtin/intl/LanguageTag.h" +#include "util/Text.h" +#include "vm/JSContext.h" -using namespace mozilla::intl::LanguageTagLimits; +using namespace js::intl::LanguageTagLimits; template static inline bool HasReplacement( const char (&subtags)[Length][TagLength], - const mozilla::intl::LanguageTagSubtag& subtag) { + const js::intl::LanguageTagSubtag& subtag) { MOZ_ASSERT(subtag.length() == TagLength - 1, "subtag must have the same length as the list of subtags"); const char* ptr = subtag.span().data(); return std::binary_search(std::begin(subtags), std::end(subtags), ptr, [](const char* a, const char* b) { - return memcmp(a, b, TagLength - 1) < 0; - }); + 
return memcmp(a, b, TagLength - 1) < 0; + }); } template static inline const char* SearchReplacement( - const char (&subtags)[Length][TagLength], const char* (&aliases)[Length], - const mozilla::intl::LanguageTagSubtag& subtag) { + const char (&subtags)[Length][TagLength], + const char* (&aliases)[Length], + const js::intl::LanguageTagSubtag& subtag) { MOZ_ASSERT(subtag.length() == TagLength - 1, "subtag must have the same length as the list of subtags"); const char* ptr = subtag.span().data(); auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr, [](const char* a, const char* b) { - return memcmp(a, b, TagLength - 1) < 0; - }); + return memcmp(a, b, TagLength - 1) < 0; + }); if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) { return aliases[std::distance(std::begin(subtags), p)]; } @@ -59,23 +62,32 @@ static bool IsAsciiLowercaseAlphanumericOrDash(char c) { } static bool IsCanonicallyCasedLanguageTag(mozilla::Span span) { - return std::all_of(span.begin(), span.end(), - mozilla::IsAsciiLowercaseAlpha); + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + return std::all_of(span.begin(), span.end(), mozilla::IsAsciiLowercaseAlpha); } static bool IsCanonicallyCasedScriptTag(mozilla::Span span) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + return mozilla::IsAsciiUppercaseAlpha(span[0]) && - std::all_of(span.begin() + 1, span.end(), - mozilla::IsAsciiLowercaseAlpha); + std::all_of(span.begin() + 1, span.end(), mozilla::IsAsciiLowercaseAlpha); } static bool IsCanonicallyCasedRegionTag(mozilla::Span span) { - return std::all_of(span.begin(), span.end(), - mozilla::IsAsciiUppercaseAlpha) || + // Tell the analysis the |std::all_of| function can't GC. 
+ JS::AutoSuppressGCAnalysis nogc; + + return std::all_of(span.begin(), span.end(), mozilla::IsAsciiUppercaseAlpha) || std::all_of(span.begin(), span.end(), mozilla::IsAsciiDigit); } static bool IsCanonicallyCasedVariantTag(mozilla::Span span) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + return std::all_of(span.begin(), span.end(), IsAsciiLowercaseAlphanumeric); } @@ -84,8 +96,7 @@ static bool IsCanonicallyCasedUnicodeKey(mozilla::Span key) { } static bool IsCanonicallyCasedUnicodeType(mozilla::Span type) { - return std::all_of(type.begin(), type.end(), - IsAsciiLowercaseAlphanumericOrDash); + return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash); } static bool IsCanonicallyCasedTransformKey(mozilla::Span key) { @@ -93,15 +104,14 @@ static bool IsCanonicallyCasedTransformKey(mozilla::Span key) { } static bool IsCanonicallyCasedTransformType(mozilla::Span type) { - return std::all_of(type.begin(), type.end(), - IsAsciiLowercaseAlphanumericOrDash); + return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash); } #endif // Mappings from language subtags to preferred values. // Derived from CLDR Supplemental Data, version 39. // https://unicode.org/Public/cldr/39/core.zip -bool mozilla::intl::Locale::languageMapping(LanguageSubtag& language) { +bool js::intl::LanguageTag::languageMapping(LanguageSubtag& language) { MOZ_ASSERT(IsStructurallyValidLanguageTag(language.span())); MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language.span())); @@ -221,7 +231,7 @@ bool mozilla::intl::Locale::languageMapping(LanguageSubtag& language) { // Language subtags with complex mappings. // Derived from CLDR Supplemental Data, version 39. 
// https://unicode.org/Public/cldr/39/core.zip -bool mozilla::intl::Locale::complexLanguageMapping(const LanguageSubtag& language) { +bool js::intl::LanguageTag::complexLanguageMapping(const LanguageSubtag& language) { MOZ_ASSERT(IsStructurallyValidLanguageTag(language.span())); MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language.span())); @@ -243,7 +253,7 @@ bool mozilla::intl::Locale::complexLanguageMapping(const LanguageSubtag& languag // Mappings from script subtags to preferred values. // Derived from CLDR Supplemental Data, version 39. // https://unicode.org/Public/cldr/39/core.zip -bool mozilla::intl::Locale::scriptMapping(ScriptSubtag& script) { +bool js::intl::LanguageTag::scriptMapping(ScriptSubtag& script) { MOZ_ASSERT(IsStructurallyValidScriptTag(script.span())); MOZ_ASSERT(IsCanonicallyCasedScriptTag(script.span())); @@ -259,7 +269,7 @@ bool mozilla::intl::Locale::scriptMapping(ScriptSubtag& script) { // Mappings from region subtags to preferred values. // Derived from CLDR Supplemental Data, version 39. // https://unicode.org/Public/cldr/39/core.zip -bool mozilla::intl::Locale::regionMapping(RegionSubtag& region) { +bool js::intl::LanguageTag::regionMapping(RegionSubtag& region) { MOZ_ASSERT(IsStructurallyValidRegionTag(region.span())); MOZ_ASSERT(IsCanonicallyCasedRegionTag(region.span())); @@ -359,7 +369,7 @@ bool mozilla::intl::Locale::regionMapping(RegionSubtag& region) { // Region subtags with complex mappings. // Derived from CLDR Supplemental Data, version 39. // https://unicode.org/Public/cldr/39/core.zip -bool mozilla::intl::Locale::complexRegionMapping(const RegionSubtag& region) { +bool js::intl::LanguageTag::complexRegionMapping(const RegionSubtag& region) { MOZ_ASSERT(IsStructurallyValidRegionTag(region.span())); MOZ_ASSERT(IsCanonicallyCasedRegionTag(region.span())); @@ -382,7 +392,7 @@ bool mozilla::intl::Locale::complexRegionMapping(const RegionSubtag& region) { // Language subtags with complex mappings. 
// Derived from CLDR Supplemental Data, version 39. // https://unicode.org/Public/cldr/39/core.zip -void mozilla::intl::Locale::performComplexLanguageMappings() { +void js::intl::LanguageTag::performComplexLanguageMappings() { MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span())); MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); @@ -418,7 +428,7 @@ void mozilla::intl::Locale::performComplexLanguageMappings() { // Region subtags with complex mappings. // Derived from CLDR Supplemental Data, version 39. // https://unicode.org/Public/cldr/39/core.zip -void mozilla::intl::Locale::performComplexRegionMappings() { +void js::intl::LanguageTag::performComplexRegionMappings() { MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span())); MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); MOZ_ASSERT(IsStructurallyValidRegionTag(region().span())); @@ -624,7 +634,7 @@ static const char* ToCharPointer(const char* str) { return str; } -static const char* ToCharPointer(const mozilla::intl::UniqueChars& str) { +static const char* ToCharPointer(const js::UniqueChars& str) { return str.get(); } @@ -636,7 +646,7 @@ static bool IsLessThan(const T& a, const U& b) { // Mappings from variant subtags to preferred values. // Derived from CLDR Supplemental Data, version 39. // https://unicode.org/Public/cldr/39/core.zip -bool mozilla::intl::Locale::performVariantMappings() { +bool js::intl::LanguageTag::performVariantMappings(JSContext* cx) { // The variant subtags need to be sorted for binary search. MOZ_ASSERT(std::is_sorted(variants_.begin(), variants_.end(), IsLessThan)); @@ -646,9 +656,9 @@ bool mozilla::intl::Locale::performVariantMappings() { }; auto insertVariantSortedIfNotPresent = [&](const char* variant) { - auto* p = std::lower_bound( - variants_.begin(), variants_.end(), variant, - IsLessThan); + auto* p = std::lower_bound(variants_.begin(), variants_.end(), variant, + IsLessThan); // Don't insert the replacement when already present. 
if (p != variants_.end() && strcmp(p->get(), variant) == 0) { @@ -656,11 +666,14 @@ bool mozilla::intl::Locale::performVariantMappings() { } // Insert the preferred variant in sort order. - auto preferred = DuplicateStringToUniqueChars(variant); + auto preferred = DuplicateString(cx, variant); + if (!preferred) { + return false; + } return !!variants_.insert(p, std::move(preferred)); }; - for (size_t i = 0; i < variants_.length();) { + for (size_t i = 0; i < variants_.length(); ) { const char* variant = variants_[i].get(); MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variant))); @@ -700,7 +713,7 @@ bool mozilla::intl::Locale::performVariantMappings() { // Canonicalize legacy locale identifiers. // Derived from CLDR Supplemental Data, version 39. // https://unicode.org/Public/cldr/39/core.zip -bool mozilla::intl::Locale::updateLegacyMappings() { +bool js::intl::LanguageTag::updateLegacyMappings(JSContext* cx) { // We're mapping legacy tags to non-legacy form here. // Other tags remain unchanged. // @@ -715,10 +728,8 @@ bool mozilla::intl::Locale::updateLegacyMappings() { } for ([[maybe_unused]] const auto& variant : variants()) { - MOZ_ASSERT( - IsStructurallyValidVariantTag(mozilla::MakeStringSpan(variant.get()))); - MOZ_ASSERT( - IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variant.get()))); + MOZ_ASSERT(IsStructurallyValidVariantTag(mozilla::MakeStringSpan(variant.get()))); + MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variant.get()))); } // The variant subtags need to be sorted for binary search. @@ -747,7 +758,10 @@ bool mozilla::intl::Locale::updateLegacyMappings() { } // Insert the preferred variant in sort order. 
- auto preferred = DuplicateStringToUniqueChars(variant); + auto preferred = DuplicateString(cx, variant); + if (!preferred) { + return false; + } return !!variants_.insert(p, std::move(preferred)); }; @@ -858,7 +872,7 @@ bool mozilla::intl::Locale::updateLegacyMappings() { // Mappings from legacy sign languages. // Derived from CLDR Supplemental Data, version 39. // https://unicode.org/Public/cldr/39/core.zip -bool mozilla::intl::Locale::signLanguageMapping(LanguageSubtag& language, +bool js::intl::LanguageTag::signLanguageMapping(LanguageSubtag& language, const RegionSubtag& region) { MOZ_ASSERT(language.equalTo("sgn")); MOZ_ASSERT(IsStructurallyValidRegionTag(region.span())); @@ -904,14 +918,16 @@ bool mozilla::intl::Locale::signLanguageMapping(LanguageSubtag& language, } template -static inline bool IsUnicodeKey(mozilla::Span key, const char (&str)[Length]) { +static inline bool IsUnicodeKey( + mozilla::Span key, const char (&str)[Length]) { static_assert(Length == UnicodeKeyLength + 1, "Unicode extension key is two characters long"); return memcmp(key.data(), str, Length - 1) == 0; } template -static inline bool IsUnicodeType(mozilla::Span type, const char (&str)[Length]) { +static inline bool IsUnicodeType( + mozilla::Span type, const char (&str)[Length]) { static_assert(Length > UnicodeKeyLength + 1, "Unicode extension type contains more than two characters"); return type.size() == (Length - 1) && @@ -944,8 +960,8 @@ static inline const char* SearchUnicodeReplacement( auto p = std::lower_bound(std::begin(types), std::end(types), type, [](const auto& a, const auto& b) { - return CompareUnicodeType(a, b) < 0; - }); + return CompareUnicodeType(a, b) < 0; + }); if (p != std::end(types) && CompareUnicodeType(*p, type) == 0) { return aliases[std::distance(std::begin(types), p)]; } @@ -959,7 +975,7 @@ static inline const char* SearchUnicodeReplacement( * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files * Spec: 
https://www.unicode.org/reports/tr35/#t_Extension */ -const char* mozilla::intl::Locale::replaceUnicodeExtensionType( +const char* js::intl::LanguageTag::replaceUnicodeExtensionType( mozilla::Span key, mozilla::Span type) { MOZ_ASSERT(key.size() == UnicodeKeyLength); MOZ_ASSERT(IsCanonicallyCasedUnicodeKey(key)); @@ -1000,67 +1016,67 @@ const char* mozilla::intl::Locale::replaceUnicodeExtensionType( else if (IsUnicodeKey(key, "rg") || IsUnicodeKey(key, "sd")) { static const char* types[144] = { - "cn11" , "cn12" , "cn13" , "cn14" , "cn15" , "cn21" , "cn22" , - "cn23" , "cn31" , "cn32" , "cn33" , "cn34" , "cn35" , "cn36" , - "cn37" , "cn41" , "cn42" , "cn43" , "cn44" , "cn45" , "cn46" , - "cn50" , "cn51" , "cn52" , "cn53" , "cn54" , "cn61" , "cn62" , - "cn63" , "cn64" , "cn65" , "cn71" , "cn91" , "cn92" , "cz10a" , - "cz10b" , "cz10c" , "cz10d" , "cz10e" , "cz10f" , "cz611" , "cz612" , - "cz613" , "cz614" , "cz615" , "cz621" , "cz622" , "cz623" , "cz624" , - "cz626" , "cz627" , "czjc" , "czjm" , "czka" , "czkr" , "czli" , - "czmo" , "czol" , "czpa" , "czpl" , "czpr" , "czst" , "czus" , - "czvy" , "czzl" , "fi01" , "fra" , "frb" , "frbl" , "frc" , - "frcp" , "frd" , "fre" , "frf" , "frg" , "frgf" , "frgp" , - "frh" , "fri" , "frj" , "frk" , "frl" , "frm" , "frmf" , - "frmq" , "frn" , "frnc" , "fro" , "frp" , "frpf" , "frpm" , - "frq" , "frr" , "frre" , "frs" , "frt" , "frtf" , "fru" , - "frv" , "frwf" , "fryt" , "laxn" , "lud" , "lug" , "lul" , - "mrnkc" , "nlaw" , "nlcw" , "nlsx" , "no23" , "nzn" , "nzs" , - "omba" , "omsh" , "plds" , "plkp" , "pllb" , "plld" , "pllu" , - "plma" , "plmz" , "plop" , "plpd" , "plpk" , "plpm" , "plsk" , - "plsl" , "plwn" , "plwp" , "plzp" , "shta" , "tteto" , "ttrcm" , - "ttwto" , "twkhq" , "twtnq" , "twtpq" , "twtxq" , "usas" , "usgu" , - "usmp" , "uspr" , "usum" , "usvi" , + "cn11", "cn12", "cn13", "cn14", "cn15", "cn21", "cn22", + "cn23", "cn31", "cn32", "cn33", "cn34", "cn35", "cn36", + "cn37", "cn41", "cn42", "cn43", "cn44", 
"cn45", "cn46", + "cn50", "cn51", "cn52", "cn53", "cn54", "cn61", "cn62", + "cn63", "cn64", "cn65", "cn71", "cn91", "cn92", "cz10a", + "cz10b", "cz10c", "cz10d", "cz10e", "cz10f", "cz611", "cz612", + "cz613", "cz614", "cz615", "cz621", "cz622", "cz623", "cz624", + "cz626", "cz627", "czjc", "czjm", "czka", "czkr", "czli", + "czmo", "czol", "czpa", "czpl", "czpr", "czst", "czus", + "czvy", "czzl", "fi01", "fra", "frb", "frbl", "frc", + "frcp", "frd", "fre", "frf", "frg", "frgf", "frgp", + "frh", "fri", "frj", "frk", "frl", "frm", "frmf", + "frmq", "frn", "frnc", "fro", "frp", "frpf", "frpm", + "frq", "frr", "frre", "frs", "frt", "frtf", "fru", + "frv", "frwf", "fryt", "laxn", "lud", "lug", "lul", + "mrnkc", "nlaw", "nlcw", "nlsx", "no23", "nzn", "nzs", + "omba", "omsh", "plds", "plkp", "pllb", "plld", "pllu", + "plma", "plmz", "plop", "plpd", "plpk", "plpm", "plsk", + "plsl", "plwn", "plwp", "plzp", "shta", "tteto", "ttrcm", + "ttwto", "twkhq", "twtnq", "twtpq", "twtxq", "usas", "usgu", + "usmp", "uspr", "usum", "usvi", }; static const char* aliases[144] = { - "cnbj" , "cntj" , "cnhe" , "cnsx" , "cnmn" , "cnln" , "cnjl" , - "cnhl" , "cnsh" , "cnjs" , "cnzj" , "cnah" , "cnfj" , "cnjx" , - "cnsd" , "cnha" , "cnhb" , "cnhn" , "cngd" , "cngx" , "cnhi" , - "cncq" , "cnsc" , "cngz" , "cnyn" , "cnxz" , "cnsn" , "cngs" , - "cnqh" , "cnnx" , "cnxj" , "twzzzz", "hkzzzz", "mozzzz", "cz110" , - "cz111" , "cz112" , "cz113" , "cz114" , "cz115" , "cz663" , "cz632" , - "cz633" , "cz634" , "cz635" , "cz641" , "cz642" , "cz643" , "cz644" , - "cz646" , "cz647" , "cz31" , "cz64" , "cz41" , "cz52" , "cz51" , - "cz80" , "cz71" , "cz53" , "cz32" , "cz10" , "cz20" , "cz42" , - "cz63" , "cz72" , "axzzzz", "frges" , "frnaq" , "blzzzz", "frara" , - "cpzzzz", "frbfc" , "frbre" , "frcvl" , "frges" , "gfzzzz", "gpzzzz", - "frcor" , "frbfc" , "fridf" , "frocc" , "frnaq" , "frges" , "mfzzzz", - "mqzzzz", "frocc" , "nczzzz", "frhdf" , "frnor" , "pfzzzz", "pmzzzz", - "frnor" , "frpdl" , "rezzzz", 
"frhdf" , "frnaq" , "tfzzzz", "frpac" , - "frara" , "wfzzzz", "ytzzzz", "laxs" , "lucl" , "luec" , "luca" , - "mr13" , "awzzzz", "cwzzzz", "sxzzzz", "no50" , "nzauk" , "nzcan" , - "ombj" , "omsj" , "pl02" , "pl04" , "pl08" , "pl10" , "pl06" , - "pl12" , "pl14" , "pl16" , "pl20" , "pl18" , "pl22" , "pl26" , - "pl24" , "pl28" , "pl30" , "pl32" , "tazzzz", "tttob" , "ttmrc" , - "tttob" , "twkhh" , "twtnn" , "twnwt" , "twtxg" , "aszzzz", "guzzzz", - "mpzzzz", "przzzz", "umzzzz", "vizzzz", + "cnbj", "cntj", "cnhe", "cnsx", "cnmn", "cnln", "cnjl", + "cnhl", "cnsh", "cnjs", "cnzj", "cnah", "cnfj", "cnjx", + "cnsd", "cnha", "cnhb", "cnhn", "cngd", "cngx", "cnhi", + "cncq", "cnsc", "cngz", "cnyn", "cnxz", "cnsn", "cngs", + "cnqh", "cnnx", "cnxj", "twzzzz", "hkzzzz", "mozzzz", "cz110", + "cz111", "cz112", "cz113", "cz114", "cz115", "cz663", "cz632", + "cz633", "cz634", "cz635", "cz641", "cz642", "cz643", "cz644", + "cz646", "cz647", "cz31", "cz64", "cz41", "cz52", "cz51", + "cz80", "cz71", "cz53", "cz32", "cz10", "cz20", "cz42", + "cz63", "cz72", "axzzzz", "frges", "frnaq", "blzzzz", "frara", + "cpzzzz", "frbfc", "frbre", "frcvl", "frges", "gfzzzz", "gpzzzz", + "frcor", "frbfc", "fridf", "frocc", "frnaq", "frges", "mfzzzz", + "mqzzzz", "frocc", "nczzzz", "frhdf", "frnor", "pfzzzz", "pmzzzz", + "frnor", "frpdl", "rezzzz", "frhdf", "frnaq", "tfzzzz", "frpac", + "frara", "wfzzzz", "ytzzzz", "laxs", "lucl", "luec", "luca", + "mr13", "awzzzz", "cwzzzz", "sxzzzz", "no50", "nzauk", "nzcan", + "ombj", "omsj", "pl02", "pl04", "pl08", "pl10", "pl06", + "pl12", "pl14", "pl16", "pl20", "pl18", "pl22", "pl26", + "pl24", "pl28", "pl30", "pl32", "tazzzz", "tttob", "ttmrc", + "tttob", "twkhh", "twtnn", "twnwt", "twtxg", "aszzzz", "guzzzz", + "mpzzzz", "przzzz", "umzzzz", "vizzzz", }; return SearchUnicodeReplacement(types, aliases, type); } else if (IsUnicodeKey(key, "tz")) { static const char* types[28] = { - "aqams" , "cnckg" , "cnhrb" , "cnkhg" , "cuba" , "egypt" , - "eire" , "est" , 
"gmt0" , "hongkong", "hst" , "iceland" , - "iran" , "israel" , "jamaica" , "japan" , "libya" , "mst" , - "navajo" , "poland" , "portugal", "prc" , "roc" , "rok" , - "turkey" , "uct" , "usnavajo", "zulu" , + "aqams", "cnckg", "cnhrb", "cnkhg", "cuba", "egypt", + "eire", "est", "gmt0", "hongkong", "hst", "iceland", + "iran", "israel", "jamaica", "japan", "libya", "mst", + "navajo", "poland", "portugal", "prc", "roc", "rok", + "turkey", "uct", "usnavajo", "zulu", }; static const char* aliases[28] = { - "nzakl" , "cnsha" , "cnsha" , "cnurc" , "cuhav" , "egcai" , - "iedub" , "utcw05" , "gmt" , "hkhkg" , "utcw10" , "isrey" , - "irthr" , "jeruslm" , "jmkin" , "jptyo" , "lytip" , "utcw07" , - "usden" , "plwaw" , "ptlis" , "cnsha" , "twtpe" , "krsel" , - "trist" , "utc" , "usden" , "utc" , + "nzakl", "cnsha", "cnsha", "cnurc", "cuhav", "egcai", + "iedub", "utcw05", "gmt", "hkhkg", "utcw10", "isrey", + "irthr", "jeruslm", "jmkin", "jptyo", "lytip", "utcw07", + "usden", "plwaw", "ptlis", "cnsha", "twtpe", "krsel", + "trist", "utc", "usden", "utc", }; return SearchUnicodeReplacement(types, aliases, type); } @@ -1068,14 +1084,16 @@ const char* mozilla::intl::Locale::replaceUnicodeExtensionType( } template -static inline bool IsTransformKey(mozilla::Span key, const char (&str)[Length]) { +static inline bool IsTransformKey( + mozilla::Span key, const char (&str)[Length]) { static_assert(Length == TransformKeyLength + 1, "Transform extension key is two characters long"); return memcmp(key.data(), str, Length - 1) == 0; } template -static inline bool IsTransformType(mozilla::Span type, const char (&str)[Length]) { +static inline bool IsTransformType( + mozilla::Span type, const char (&str)[Length]) { static_assert(Length > TransformKeyLength + 1, "Transform extension type contains more than two characters"); return type.size() == (Length - 1) && @@ -1089,7 +1107,7 @@ static inline bool IsTransformType(mozilla::Span type, const char (& * Spec: 
https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files * Spec: https://www.unicode.org/reports/tr35/#t_Extension */ -const char* mozilla::intl::Locale::replaceTransformExtensionType( +const char* js::intl::LanguageTag::replaceTransformExtensionType( mozilla::Span key, mozilla::Span type) { MOZ_ASSERT(key.size() == TransformKeyLength); MOZ_ASSERT(IsCanonicallyCasedTransformKey(key)); diff --git a/js/src/builtin/intl/Locale.cpp b/js/src/builtin/intl/Locale.cpp index f9dd46fa6dfc..151fde560c88 100644 --- a/js/src/builtin/intl/Locale.cpp +++ b/js/src/builtin/intl/Locale.cpp @@ -11,7 +11,6 @@ #include "mozilla/ArrayUtils.h" #include "mozilla/Assertions.h" #include "mozilla/Casting.h" -#include "mozilla/intl/Locale.h" #include "mozilla/Maybe.h" #include "mozilla/Span.h" #include "mozilla/TextUtils.h" @@ -24,9 +23,7 @@ #include "builtin/Boolean.h" #include "builtin/intl/CommonFunctions.h" -#include "builtin/intl/FormatBuffer.h" #include "builtin/intl/LanguageTag.h" -#include "builtin/intl/StringAsciiChars.h" #include "builtin/String.h" #include "gc/Rooting.h" #include "js/Conversions.h" @@ -45,7 +42,10 @@ #include "vm/NativeObject-inl.h" using namespace js; -using namespace mozilla::intl::LanguageTagLimits; +using namespace js::intl::LanguageTagLimits; + +using intl::LanguageTag; +using intl::LanguageTagParser; const JSClass LocaleObject::class_ = { "Intl.Locale", @@ -60,7 +60,7 @@ static inline bool IsLocale(HandleValue v) { } // Return the length of the base-name subtags. -static size_t BaseNameLength(const mozilla::intl::Locale& tag) { +static size_t BaseNameLength(const LanguageTag& tag) { size_t baseNameLength = tag.language().length(); if (tag.script().present()) { baseNameLength += 1 + tag.script().length(); @@ -88,7 +88,7 @@ struct IndexAndLength { // Compute the Unicode extension's index and length in the extension subtag. 
static mozilla::Maybe UnicodeExtensionPosition( - const mozilla::intl::Locale& tag) { + const LanguageTag& tag) { size_t index = 0; for (const auto& extension : tag.extensions()) { MOZ_ASSERT(!mozilla::IsAsciiUppercaseAlpha(extension[0]), @@ -106,14 +106,8 @@ static mozilla::Maybe UnicodeExtensionPosition( } static LocaleObject* CreateLocaleObject(JSContext* cx, HandleObject prototype, - const mozilla::intl::Locale& tag) { - intl::FormatBuffer buffer(cx); - if (auto result = tag.toString(buffer); result.isErr()) { - intl::ReportInternalError(cx, result.unwrapErr()); - return nullptr; - } - - RootedString tagStr(cx, buffer.toString(cx)); + const LanguageTag& tag) { + RootedString tagStr(cx, tag.toString(cx)); if (!tagStr) { return nullptr; } @@ -148,27 +142,9 @@ static LocaleObject* CreateLocaleObject(JSContext* cx, HandleObject prototype, return locale; } -static inline bool IsValidUnicodeExtensionValue(JSContext* cx, - JSLinearString* linear, - bool* isValid) { - if (linear->length() == 0) { - *isValid = false; - return true; - } - - if (!StringIsAscii(linear)) { - *isValid = false; - return true; - } - - intl::StringAsciiChars chars(linear); - if (!chars.init(cx)) { - return false; - } - - *isValid = - mozilla::intl::LocaleParser::canParseUnicodeExtensionType(chars).isOk(); - return true; +static inline bool IsValidUnicodeExtensionValue(JSLinearString* linear) { + return linear->length() > 0 && + LanguageTagParser::canParseUnicodeExtensionType(linear); } /** Iterate through (sep keyword) in a valid, lowercased Unicode extension. */ @@ -298,7 +274,7 @@ static bool GetBooleanOption(JSContext* cx, HandleObject options, /** * ApplyOptionsToTag ( tag, options ) */ -static bool ApplyOptionsToTag(JSContext* cx, mozilla::intl::Locale& tag, +static bool ApplyOptionsToTag(JSContext* cx, LanguageTag& tag, HandleObject options) { // Steps 1-2 (Already performed in caller). 
@@ -310,7 +286,7 @@ static bool ApplyOptionsToTag(JSContext* cx, mozilla::intl::Locale& tag, } // Step 4. - mozilla::intl::LanguageSubtag language; + intl::LanguageSubtag language; if (option && !intl::ParseStandaloneLanguageTag(option, language)) { if (UniqueChars str = QuoteString(cx, option, '"')) { JS_ReportErrorNumberASCII(cx, js::GetErrorMessage, nullptr, @@ -326,7 +302,7 @@ static bool ApplyOptionsToTag(JSContext* cx, mozilla::intl::Locale& tag, } // Step 6. - mozilla::intl::ScriptSubtag script; + intl::ScriptSubtag script; if (option && !intl::ParseStandaloneScriptTag(option, script)) { if (UniqueChars str = QuoteString(cx, option, '"')) { JS_ReportErrorNumberASCII(cx, js::GetErrorMessage, nullptr, @@ -342,7 +318,7 @@ static bool ApplyOptionsToTag(JSContext* cx, mozilla::intl::Locale& tag, } // Step 8. - mozilla::intl::RegionSubtag region; + intl::RegionSubtag region; if (option && !intl::ParseStandaloneRegionTag(option, region)) { if (UniqueChars str = QuoteString(cx, option, '"')) { JS_ReportErrorNumberASCII(cx, js::GetErrorMessage, nullptr, @@ -374,16 +350,8 @@ static bool ApplyOptionsToTag(JSContext* cx, mozilla::intl::Locale& tag, // Step 13. // Optimized to only canonicalize the base-name subtags. All other // canonicalization steps will happen later. 
- auto result = tag.canonicalizeBaseName(); - if (result.isErr()) { - if (result.unwrapErr() == - mozilla::intl::Locale::CanonicalizationError::DuplicateVariant) { - JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, - JSMSG_DUPLICATE_VARIANT_SUBTAG); - } else { - intl::ReportInternalError(cx); - } - return false; + if (!tag.canonicalizeBaseName(cx)) { + return true; } } @@ -394,7 +362,7 @@ static bool ApplyOptionsToTag(JSContext* cx, mozilla::intl::Locale& tag, * ApplyUnicodeExtensionToTag( tag, options, relevantExtensionKeys ) */ bool js::intl::ApplyUnicodeExtensionToTag( - JSContext* cx, mozilla::intl::Locale& tag, + JSContext* cx, LanguageTag& tag, JS::HandleVector keywords) { // If no Unicode extensions were present in the options object, we can skip // everything below and directly return. @@ -469,12 +437,12 @@ bool js::intl::ApplyUnicodeExtensionToTag( return false; } - if (!tag.setUnicodeExtension(newExtension.begin())) { - intl::ReportInternalError(cx); + // Insert the new Unicode extension string into the language tag. + UniqueChars newExtensionChars(newExtension.extractOrCopyRawBuffer()); + if (!newExtensionChars) { return false; } - - return true; + return tag.setUnicodeExtension(std::move(newExtensionChars)); } static JS::Result LanguageTagFromMaybeWrappedLocale(JSContext* cx, @@ -553,19 +521,12 @@ static bool Locale(JSContext* cx, unsigned argc, Value* vp) { } // ApplyOptionsToTag, steps 2 and 9. 
- mozilla::intl::Locale tag; - if (!intl::ParseLocale(cx, tagLinearStr, tag)) { + LanguageTag tag(cx); + if (!LanguageTagParser::parse(cx, tagLinearStr, tag)) { return false; } - if (auto result = tag.canonicalizeBaseName(); result.isErr()) { - if (result.unwrapErr() == - mozilla::intl::Locale::CanonicalizationError::DuplicateVariant) { - JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, - JSMSG_DUPLICATE_VARIANT_SUBTAG); - } else { - intl::ReportInternalError(cx); - } + if (!tag.canonicalizeBaseName(cx)) { return false; } @@ -586,12 +547,7 @@ static bool Locale(JSContext* cx, unsigned argc, Value* vp) { // Steps 15-16. if (calendar) { - bool isValid; - if (!IsValidUnicodeExtensionValue(cx, calendar, &isValid)) { - return false; - } - - if (!isValid) { + if (!IsValidUnicodeExtensionValue(calendar)) { if (UniqueChars str = QuoteString(cx, calendar, '"')) { JS_ReportErrorNumberASCII(cx, js::GetErrorMessage, nullptr, JSMSG_INVALID_OPTION_VALUE, "calendar", @@ -613,12 +569,7 @@ static bool Locale(JSContext* cx, unsigned argc, Value* vp) { // Steps 18-19. if (collation) { - bool isValid; - if (!IsValidUnicodeExtensionValue(cx, collation, &isValid)) { - return false; - } - - if (!isValid) { + if (!IsValidUnicodeExtensionValue(collation)) { if (UniqueChars str = QuoteString(cx, collation, '"')) { JS_ReportErrorNumberASCII(cx, js::GetErrorMessage, nullptr, JSMSG_INVALID_OPTION_VALUE, "collation", @@ -703,11 +654,7 @@ static bool Locale(JSContext* cx, unsigned argc, Value* vp) { // Steps 28-29. if (numberingSystem) { - bool isValid; - if (!IsValidUnicodeExtensionValue(cx, numberingSystem, &isValid)) { - return false; - } - if (!isValid) { + if (!IsValidUnicodeExtensionValue(numberingSystem)) { if (UniqueChars str = QuoteString(cx, numberingSystem, '"')) { JS_ReportErrorNumberASCII(cx, js::GetErrorMessage, nullptr, JSMSG_INVALID_OPTION_VALUE, @@ -729,14 +676,7 @@ static bool Locale(JSContext* cx, unsigned argc, Value* vp) { // ApplyOptionsToTag, steps 9 and 13. 
// ApplyUnicodeExtensionToTag, step 9. - if (auto result = tag.canonicalizeExtensions(); result.isErr()) { - if (result.unwrapErr() == - mozilla::intl::Locale::CanonicalizationError::DuplicateVariant) { - JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, - JSMSG_DUPLICATE_VARIANT_SUBTAG); - } else { - intl::ReportInternalError(cx); - } + if (!tag.canonicalizeExtensions(cx)) { return false; } @@ -900,25 +840,19 @@ static BaseNamePartsResult BaseNameParts(const CharT* baseName, size_t length) { languageLength = length; } - // Tell the analysis the |IsStructurallyValid*Tag| functions can't GC. - JS::AutoSuppressGCAnalysis nogc; - IndexAndLength language{0, languageLength}; - MOZ_ASSERT( - mozilla::intl::IsStructurallyValidLanguageTag(language.spanOf(baseName))); + MOZ_ASSERT(intl::IsStructurallyValidLanguageTag(language.spanOf(baseName))); mozilla::Maybe script{}; if (scriptIndex) { script.emplace(scriptIndex, ScriptLength); - MOZ_ASSERT( - mozilla::intl::IsStructurallyValidScriptTag(script->spanOf(baseName))); + MOZ_ASSERT(intl::IsStructurallyValidScriptTag(script->spanOf(baseName))); } mozilla::Maybe region{}; if (regionIndex) { region.emplace(regionIndex, regionLength); - MOZ_ASSERT( - mozilla::intl::IsStructurallyValidRegionTag(region->spanOf(baseName))); + MOZ_ASSERT(intl::IsStructurallyValidRegionTag(region->spanOf(baseName))); } return {language, script, region}; @@ -942,13 +876,12 @@ static bool Locale_maximize(JSContext* cx, const CallArgs& args) { return false; } - mozilla::intl::Locale tag; - if (!intl::ParseLocale(cx, tagStr, tag)) { + LanguageTag tag(cx); + if (!LanguageTagParser::parse(cx, tagStr, tag)) { return false; } - if (!tag.addLikelySubtags()) { - intl::ReportInternalError(cx); + if (!tag.addLikelySubtags(cx)) { return false; } @@ -979,13 +912,12 @@ static bool Locale_minimize(JSContext* cx, const CallArgs& args) { return false; } - mozilla::intl::Locale tag; - if (!intl::ParseLocale(cx, tagStr, tag)) { + LanguageTag tag(cx); + if 
(!LanguageTagParser::parse(cx, tagStr, tag)) { return false; } - if (!tag.removeLikelySubtags()) { - intl::ReportInternalError(cx); + if (!tag.removeLikelySubtags(cx)) { return false; } @@ -1348,34 +1280,19 @@ bool js::intl_ValidateAndCanonicalizeLanguageTag(JSContext* cx, unsigned argc, return true; } - mozilla::intl::Locale tag; - if (!intl::ParseLocale(cx, tagLinearStr, tag)) { + LanguageTag tag(cx); + if (!LanguageTagParser::parse(cx, tagLinearStr, tag)) { return false; } - auto result = tag.canonicalize(); - if (result.isErr()) { - if (result.unwrapErr() == - mozilla::intl::Locale::CanonicalizationError::DuplicateVariant) { - JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, - JSMSG_DUPLICATE_VARIANT_SUBTAG); - } else { - intl::ReportInternalError(cx); - } + if (!tag.canonicalize(cx)) { return false; } - intl::FormatBuffer buffer(cx); - if (auto result = tag.toString(buffer); result.isErr()) { - intl::ReportInternalError(cx, result.unwrapErr()); - return false; - } - - JSString* resultStr = buffer.toString(cx); + JSString* resultStr = tag.toString(cx); if (!resultStr) { return false; } - args.rval().setString(resultStr); return true; } @@ -1390,45 +1307,22 @@ bool js::intl_TryValidateAndCanonicalizeLanguageTag(JSContext* cx, return false; } - mozilla::intl::Locale tag; - { - if (!StringIsAscii(linear)) { - // The caller handles invalid inputs. - args.rval().setNull(); - return true; - } + LanguageTag tag(cx); + bool ok; + JS_TRY_VAR_OR_RETURN_FALSE(cx, ok, + LanguageTagParser::tryParse(cx, linear, tag)); - intl::StringAsciiChars chars(linear); - if (!chars.init(cx)) { - return false; - } - - if (mozilla::intl::LocaleParser::tryParse(chars, tag).isErr()) { - // The caller handles invalid inputs. - args.rval().setNull(); - return true; - } + // The caller handles invalid inputs. 
+ if (!ok) { + args.rval().setNull(); + return true; } - auto result = tag.canonicalize(); - if (result.isErr()) { - if (result.unwrapErr() == - mozilla::intl::Locale::CanonicalizationError::DuplicateVariant) { - JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, - JSMSG_DUPLICATE_VARIANT_SUBTAG); - } else { - intl::ReportInternalError(cx); - } + if (!tag.canonicalize(cx)) { return false; } - intl::FormatBuffer buffer(cx); - if (auto result = tag.toString(buffer); result.isErr()) { - intl::ReportInternalError(cx, result.unwrapErr()); - return false; - } - - JSString* resultStr = buffer.toString(cx); + JSString* resultStr = tag.toString(cx); if (!resultStr) { return false; } @@ -1456,11 +1350,7 @@ bool js::intl_ValidateAndCanonicalizeUnicodeExtensionType(JSContext* cx, return false; } - bool isValid; - if (!IsValidUnicodeExtensionValue(cx, unicodeType, &isValid)) { - return false; - } - if (!isValid) { + if (!IsValidUnicodeExtensionValue(unicodeType)) { UniqueChars optionChars = EncodeAscii(cx, optionArg.toString()); if (!optionChars) { return false; @@ -1501,8 +1391,8 @@ bool js::intl_ValidateAndCanonicalizeUnicodeExtensionType(JSContext* cx, MOZ_ASSERT(strlen(unicodeTypeChars.get()) == unicodeTypeLength); // Convert into canonical case before searching for replacements. - mozilla::intl::AsciiToLowerCase(unicodeTypeChars.get(), unicodeTypeLength, - unicodeTypeChars.get()); + intl::AsciiToLowerCase(unicodeTypeChars.get(), unicodeTypeLength, + unicodeTypeChars.get()); auto key = mozilla::Span(unicodeKey, UnicodeKeyLength); auto type = mozilla::Span(unicodeTypeChars.get(), unicodeTypeLength); @@ -1510,7 +1400,7 @@ bool js::intl_ValidateAndCanonicalizeUnicodeExtensionType(JSContext* cx, // Search if there's a replacement for the current Unicode keyword. 
JSString* result; if (const char* replacement = - mozilla::intl::Locale::replaceUnicodeExtensionType(key, type)) { + LanguageTag::replaceUnicodeExtensionType(key, type)) { result = NewStringCopyZ(cx, replacement); } else { result = StringToLowerCase(cx, unicodeType); diff --git a/js/src/builtin/intl/NumberFormat.cpp b/js/src/builtin/intl/NumberFormat.cpp index e76d5f9aedc1..e79524b574e3 100644 --- a/js/src/builtin/intl/NumberFormat.cpp +++ b/js/src/builtin/intl/NumberFormat.cpp @@ -11,7 +11,6 @@ #include "mozilla/Assertions.h" #include "mozilla/Casting.h" #include "mozilla/FloatingPoint.h" -#include "mozilla/intl/Locale.h" #include "mozilla/intl/MeasureUnit.h" #include "mozilla/intl/NumberFormat.h" #include "mozilla/intl/NumberingSystem.h" @@ -32,7 +31,6 @@ #include "builtin/Array.h" #include "builtin/intl/CommonFunctions.h" #include "builtin/intl/DecimalNumber.h" -#include "builtin/intl/FormatBuffer.h" #include "builtin/intl/LanguageTag.h" #include "builtin/intl/MeasureUnitGenerated.h" #include "builtin/intl/RelativeTimeFormat.h" @@ -290,14 +288,14 @@ static UniqueChars NumberFormatLocale(JSContext* cx, HandleObject internals) { // ICU expects numberingSystem as a Unicode locale extensions on locale. 
- mozilla::intl::Locale tag; + intl::LanguageTag tag(cx); { - RootedLinearString locale(cx, value.toString()->ensureLinear(cx)); + JSLinearString* locale = value.toString()->ensureLinear(cx); if (!locale) { return nullptr; } - if (!intl::ParseLocale(cx, locale, tag)) { + if (!intl::LanguageTagParser::parse(cx, locale, tag)) { return nullptr; } } @@ -328,12 +326,7 @@ static UniqueChars NumberFormatLocale(JSContext* cx, HandleObject internals) { return nullptr; } - intl::FormatBuffer buffer(cx); - if (auto result = tag.toString(buffer); result.isErr()) { - intl::ReportInternalError(cx, result.unwrapErr()); - return nullptr; - } - return buffer.extractStringZ(); + return tag.toStringZ(cx); } struct NumberFormatOptions : public mozilla::intl::NumberRangeFormatOptions { diff --git a/js/src/builtin/intl/RelativeTimeFormat.cpp b/js/src/builtin/intl/RelativeTimeFormat.cpp index d7b149fb2009..15508625d65e 100644 --- a/js/src/builtin/intl/RelativeTimeFormat.cpp +++ b/js/src/builtin/intl/RelativeTimeFormat.cpp @@ -168,14 +168,14 @@ static mozilla::intl::RelativeTimeFormat* NewRelativeTimeFormatter( // ICU expects numberingSystem as a Unicode locale extensions on locale. 
- mozilla::intl::Locale tag; + intl::LanguageTag tag(cx); { - RootedLinearString locale(cx, value.toString()->ensureLinear(cx)); + JSLinearString* locale = value.toString()->ensureLinear(cx); if (!locale) { return nullptr; } - if (!intl::ParseLocale(cx, locale, tag)) { + if (!intl::LanguageTagParser::parse(cx, locale, tag)) { return nullptr; } } @@ -206,13 +206,7 @@ static mozilla::intl::RelativeTimeFormat* NewRelativeTimeFormatter( return nullptr; } - intl::FormatBuffer buffer(cx); - if (auto result = tag.toString(buffer); result.isErr()) { - intl::ReportInternalError(cx, result.unwrapErr()); - return nullptr; - } - - UniqueChars locale = buffer.extractStringZ(); + UniqueChars locale = tag.toStringZ(cx); if (!locale) { return nullptr; } diff --git a/js/src/builtin/intl/SharedIntlData.cpp b/js/src/builtin/intl/SharedIntlData.cpp index 22b262fbb72e..6a0e553120f7 100644 --- a/js/src/builtin/intl/SharedIntlData.cpp +++ b/js/src/builtin/intl/SharedIntlData.cpp @@ -377,7 +377,7 @@ bool js::intl::SharedIntlData::getAvailableLocales( // + 4 * Alphanum script subtag // + 1 separator // + 2 * Alpha region subtag - using namespace mozilla::intl::LanguageTagLimits; + using namespace intl::LanguageTagLimits; static constexpr size_t MinLanguageLength = 2; static constexpr size_t MinLengthForScriptAndRegion = MinLanguageLength + 1 + ScriptLength + 1 + AlphaRegionLength; @@ -407,8 +407,7 @@ bool js::intl::SharedIntlData::getAvailableLocales( // Continue with the next locale if we didn't find a script subtag. size_t scriptLength = sep - script; - if (!mozilla::intl::IsStructurallyValidScriptTag( - {script, scriptLength})) { + if (!IsStructurallyValidScriptTag({script, scriptLength})) { continue; } @@ -420,8 +419,7 @@ bool js::intl::SharedIntlData::getAvailableLocales( // Continue with the next locale if we didn't find a region subtag. size_t regionLength = (sep ? 
sep : lang.end()) - region; - if (!mozilla::intl::IsStructurallyValidRegionTag( - {region, regionLength})) { + if (!IsStructurallyValidRegionTag({region, regionLength})) { continue; } diff --git a/js/src/builtin/intl/StringAsciiChars.h b/js/src/builtin/intl/StringAsciiChars.h deleted file mode 100644 index 64408c68e79a..000000000000 --- a/js/src/builtin/intl/StringAsciiChars.h +++ /dev/null @@ -1,78 +0,0 @@ -/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- - * vim: set ts=8 sts=2 et sw=2 tw=80: - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -#ifndef builtin_intl_StringAsciiChars_h -#define builtin_intl_StringAsciiChars_h - -#include "mozilla/Assertions.h" -#include "mozilla/Attributes.h" -#include "mozilla/Maybe.h" -#include "mozilla/Span.h" -#include "mozilla/TextUtils.h" - -#include - -#include "js/GCAPI.h" -#include "js/TypeDecls.h" -#include "js/Vector.h" - -#include "vm/StringType.h" - -namespace js::intl { - -/** - * String view of an ASCII-only string. - * - * This holds a reference to a JSLinearString and can produce a string view - * into that string. If the string is represented by Latin1 characters, the - * span is returned directly. If the string is represented by UTF-16 - * characters, it copies the char16_t characters into a char array, and then - * returns a span based on the copy. - * - * This allows us to avoid copying for the common use case that the ASCII - * characters are represented in Latin1. - */ -class MOZ_STACK_CLASS StringAsciiChars final { - // When copying string characters, use this many bytes of inline storage. 
- static const size_t InlineCapacity = 24; - - JS::AutoCheckCannotGC nogc_; - - JSLinearString* str_; - - mozilla::Maybe> ownChars_; - - public: - explicit StringAsciiChars(JSLinearString* str) : str_(str) { - MOZ_ASSERT(StringIsAscii(str)); - } - - operator mozilla::Span() const { - if (str_->hasLatin1Chars()) { - return mozilla::AsChars(str_->latin1Range(nogc_)); - } - return mozilla::AsChars(mozilla::Span(*ownChars_)); - } - - [[nodiscard]] bool init(JSContext* cx) { - if (str_->hasLatin1Chars()) { - return true; - } - - ownChars_.emplace(cx); - if (!ownChars_->resize(str_->length())) { - return false; - } - - js::CopyChars(ownChars_->begin(), *str_); - - return true; - } -}; - -} // namespace js::intl - -#endif // builtin_intl_StringAsciiChars_h diff --git a/js/src/builtin/intl/make_intl_data.py b/js/src/builtin/intl/make_intl_data.py index a7e1661f25f8..c9ced209f183 100755 --- a/js/src/builtin/intl/make_intl_data.py +++ b/js/src/builtin/intl/make_intl_data.py @@ -17,8 +17,7 @@ This script extracts information about 1) mappings between deprecated and current Unicode BCP 47 locale identifiers, and 2) deprecated and current BCP 47 Unicode extension value from CLDR, and converts it to C++ mapping - code in intl/components/LocaleGenerated.cpp. The code is used in - intl/components/Locale.cpp. + code in LanguageTagGenerated.cpp. The code is used in LanguageTag.cpp. 
Target "tzdata": @@ -126,7 +125,7 @@ def writeMappingsBinarySearch( writeMappingHeader(println, description, source, url) println( """ -bool mozilla::intl::Locale::{0}({1} {2}) {{ +bool js::intl::LanguageTag::{0}({1} {2}) {{ MOZ_ASSERT({3}({2}.span())); MOZ_ASSERT({4}({2}.span())); """.format( @@ -310,7 +309,7 @@ def writeComplexLanguageTagMappings( writeMappingHeader(println, description, source, url) println( """ -void mozilla::intl::Locale::performComplexLanguageMappings() { +void js::intl::LanguageTag::performComplexLanguageMappings() { MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span())); MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); """.lstrip() @@ -407,7 +406,7 @@ def writeComplexRegionTagMappings( writeMappingHeader(println, description, source, url) println( """ -void mozilla::intl::Locale::performComplexRegionMappings() { +void js::intl::LanguageTag::performComplexRegionMappings() { MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span())); MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); MOZ_ASSERT(IsStructurallyValidRegionTag(region().span())); @@ -525,7 +524,7 @@ static const char* ToCharPointer(const char* str) { return str; } -static const char* ToCharPointer(const mozilla::intl::UniqueChars& str) { +static const char* ToCharPointer(const js::UniqueChars& str) { return str.get(); } @@ -538,7 +537,7 @@ static bool IsLessThan(const T& a, const U& b) { writeMappingHeader(println, description, source, url) println( """ -bool mozilla::intl::Locale::performVariantMappings() { +bool js::intl::LanguageTag::performVariantMappings(JSContext* cx) { // The variant subtags need to be sorted for binary search. 
MOZ_ASSERT(std::is_sorted(variants_.begin(), variants_.end(), IsLessThan)); @@ -548,9 +547,9 @@ bool mozilla::intl::Locale::performVariantMappings() { }; auto insertVariantSortedIfNotPresent = [&](const char* variant) { - auto* p = std::lower_bound( - variants_.begin(), variants_.end(), variant, - IsLessThan); + auto* p = std::lower_bound(variants_.begin(), variants_.end(), variant, + IsLessThan); // Don't insert the replacement when already present. if (p != variants_.end() && strcmp(p->get(), variant) == 0) { @@ -558,11 +557,14 @@ bool mozilla::intl::Locale::performVariantMappings() { } // Insert the preferred variant in sort order. - auto preferred = DuplicateStringToUniqueChars(variant); + auto preferred = DuplicateString(cx, variant); + if (!preferred) { + return false; + } return !!variants_.insert(p, std::move(preferred)); }; - for (size_t i = 0; i < variants_.length();) { + for (size_t i = 0; i < variants_.length(); ) { const char* variant = variants_[i].get(); MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variant))); """.lstrip() @@ -655,7 +657,7 @@ def writeLegacyMappingsFunction(println, legacy_mappings, description, source, u writeMappingHeader(println, description, source, url) println( """\ -bool mozilla::intl::Locale::updateLegacyMappings() { +bool js::intl::LanguageTag::updateLegacyMappings(JSContext* cx) { // We're mapping legacy tags to non-legacy form here. // Other tags remain unchanged. // @@ -670,10 +672,8 @@ bool mozilla::intl::Locale::updateLegacyMappings() { } for ([[maybe_unused]] const auto& variant : variants()) { - MOZ_ASSERT( - IsStructurallyValidVariantTag(mozilla::MakeStringSpan(variant.get()))); - MOZ_ASSERT( - IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variant.get()))); + MOZ_ASSERT(IsStructurallyValidVariantTag(mozilla::MakeStringSpan(variant.get()))); + MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variant.get()))); } // The variant subtags need to be sorted for binary search. 
@@ -702,7 +702,10 @@ bool mozilla::intl::Locale::updateLegacyMappings() { } // Insert the preferred variant in sort order. - auto preferred = DuplicateStringToUniqueChars(variant); + auto preferred = DuplicateString(cx, variant); + if (!preferred) { + return false; + } return !!variants_.insert(p, std::move(preferred)); }; @@ -921,7 +924,7 @@ def writeSignLanguageMappingsFunction( writeMappingHeader(println, description, source, url) println( """\ -bool mozilla::intl::Locale::signLanguageMapping(LanguageSubtag& language, +bool js::intl::LanguageTag::signLanguageMapping(LanguageSubtag& language, const RegionSubtag& region) { MOZ_ASSERT(language.equalTo("sgn")); MOZ_ASSERT(IsStructurallyValidRegionTag(region.span())); @@ -1643,36 +1646,39 @@ def writeCLDRLanguageTagData(println, data, url): #include #include -#include "mozilla/intl/Locale.h" +#include "builtin/intl/LanguageTag.h" +#include "util/Text.h" +#include "vm/JSContext.h" -using namespace mozilla::intl::LanguageTagLimits; +using namespace js::intl::LanguageTagLimits; template static inline bool HasReplacement( const char (&subtags)[Length][TagLength], - const mozilla::intl::LanguageTagSubtag& subtag) { + const js::intl::LanguageTagSubtag& subtag) { MOZ_ASSERT(subtag.length() == TagLength - 1, "subtag must have the same length as the list of subtags"); const char* ptr = subtag.span().data(); return std::binary_search(std::begin(subtags), std::end(subtags), ptr, [](const char* a, const char* b) { - return memcmp(a, b, TagLength - 1) < 0; - }); + return memcmp(a, b, TagLength - 1) < 0; + }); } template static inline const char* SearchReplacement( - const char (&subtags)[Length][TagLength], const char* (&aliases)[Length], - const mozilla::intl::LanguageTagSubtag& subtag) { + const char (&subtags)[Length][TagLength], + const char* (&aliases)[Length], + const js::intl::LanguageTagSubtag& subtag) { MOZ_ASSERT(subtag.length() == TagLength - 1, "subtag must have the same length as the list of subtags"); const char* 
ptr = subtag.span().data(); auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr, [](const char* a, const char* b) { - return memcmp(a, b, TagLength - 1) < 0; - }); + return memcmp(a, b, TagLength - 1) < 0; + }); if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) { return aliases[std::distance(std::begin(subtags), p)]; } @@ -1689,23 +1695,32 @@ static bool IsAsciiLowercaseAlphanumericOrDash(char c) { } static bool IsCanonicallyCasedLanguageTag(mozilla::Span span) { - return std::all_of(span.begin(), span.end(), - mozilla::IsAsciiLowercaseAlpha); + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + return std::all_of(span.begin(), span.end(), mozilla::IsAsciiLowercaseAlpha); } static bool IsCanonicallyCasedScriptTag(mozilla::Span span) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + return mozilla::IsAsciiUppercaseAlpha(span[0]) && - std::all_of(span.begin() + 1, span.end(), - mozilla::IsAsciiLowercaseAlpha); + std::all_of(span.begin() + 1, span.end(), mozilla::IsAsciiLowercaseAlpha); } static bool IsCanonicallyCasedRegionTag(mozilla::Span span) { - return std::all_of(span.begin(), span.end(), - mozilla::IsAsciiUppercaseAlpha) || + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + return std::all_of(span.begin(), span.end(), mozilla::IsAsciiUppercaseAlpha) || std::all_of(span.begin(), span.end(), mozilla::IsAsciiDigit); } static bool IsCanonicallyCasedVariantTag(mozilla::Span span) { + // Tell the analysis the |std::all_of| function can't GC. 
+ JS::AutoSuppressGCAnalysis nogc; + return std::all_of(span.begin(), span.end(), IsAsciiLowercaseAlphanumeric); } @@ -1714,8 +1729,7 @@ static bool IsCanonicallyCasedUnicodeKey(mozilla::Span key) { } static bool IsCanonicallyCasedUnicodeType(mozilla::Span type) { - return std::all_of(type.begin(), type.end(), - IsAsciiLowercaseAlphanumericOrDash); + return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash); } static bool IsCanonicallyCasedTransformKey(mozilla::Span key) { @@ -1723,8 +1737,7 @@ static bool IsCanonicallyCasedTransformKey(mozilla::Span key) { } static bool IsCanonicallyCasedTransformType(mozilla::Span type) { - return std::all_of(type.begin(), type.end(), - IsAsciiLowercaseAlphanumericOrDash); + return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash); } #endif """.rstrip() @@ -2036,7 +2049,7 @@ def readCLDRVersionFromICU(): def updateCLDRLangTags(args): - """ Update the LocaleGenerated.cpp file. """ + """ Update the LanguageTagGenerated.cpp file. 
""" version = args.version url = args.url out = args.out @@ -3191,14 +3204,16 @@ def writeUnicodeExtensionsMappings(println, mapping, extension): println( """ template -static inline bool Is{0}Key(mozilla::Span key, const char (&str)[Length]) {{ +static inline bool Is{0}Key( + mozilla::Span key, const char (&str)[Length]) {{ static_assert(Length == {0}KeyLength + 1, "{0} extension key is two characters long"); return memcmp(key.data(), str, Length - 1) == 0; }} template -static inline bool Is{0}Type(mozilla::Span type, const char (&str)[Length]) {{ +static inline bool Is{0}Type( + mozilla::Span type, const char (&str)[Length]) {{ static_assert(Length > {0}KeyLength + 1, "{0} extension type contains more than two characters"); return type.size() == (Length - 1) && @@ -3247,8 +3262,8 @@ static inline const char* Search{0}Replacement( auto p = std::lower_bound(std::begin(types), std::end(types), type, [](const auto& a, const auto& b) {{ - return Compare{0}Type(a, b) < 0; - }}); + return Compare{0}Type(a, b) < 0; + }}); if (p != std::end(types) && Compare{0}Type(*p, type) == 0) {{ return aliases[std::distance(std::begin(types), p)]; }} @@ -3270,7 +3285,7 @@ static inline const char* Search{0}Replacement( * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files * Spec: https://www.unicode.org/reports/tr35/#t_Extension */ -const char* mozilla::intl::Locale::replace{0}ExtensionType( +const char* js::intl::LanguageTag::replace{0}ExtensionType( mozilla::Span key, mozilla::Span type) {{ MOZ_ASSERT(key.size() == {0}KeyLength); MOZ_ASSERT(IsCanonicallyCased{0}Key(key)); @@ -3292,11 +3307,11 @@ const char* mozilla::intl::Locale::replace{0}ExtensionType( for entries in grouper(subtags, max_entries): entries = ( - '"{}"'.format(tag).center(length + 2) + '"{}"'.format(tag).rjust(length + 2) for tag in entries if tag is not None ) - println(" {},".format(", ".join(entries))) + println(" {},".format(", ".join(entries))) println(" };") @@ -4039,9 +4054,7 @@ 
if __name__ == "__main__": ) parser_cldr_tags.add_argument( "--out", - default=os.path.join( - topsrcdir, "intl", "components", "src", "LocaleGenerated.cpp" - ), + default="LanguageTagGenerated.cpp", help="Output file (default: %(default)s)", ) parser_cldr_tags.add_argument( diff --git a/js/src/moz.build b/js/src/moz.build index 870ea9a3ab4a..f445309008d0 100755 --- a/js/src/moz.build +++ b/js/src/moz.build @@ -478,6 +478,7 @@ if CONFIG["JS_HAS_INTL_API"]: "builtin/intl/DisplayNames.cpp", "builtin/intl/IntlObject.cpp", "builtin/intl/LanguageTag.cpp", + "builtin/intl/LanguageTagGenerated.cpp", "builtin/intl/ListFormat.cpp", "builtin/intl/Locale.cpp", "builtin/intl/NumberFormat.cpp",