diff --git a/dom/base/nsContentUtils.cpp b/dom/base/nsContentUtils.cpp index b349035b6f1b..96e3c78653c3 100644 --- a/dom/base/nsContentUtils.cpp +++ b/dom/base/nsContentUtils.cpp @@ -1919,6 +1919,33 @@ nsIBidiKeyboard* nsContentUtils::GetBidiKeyboard() { return sBidiKeyboard; } +/** + * This is used to determine whether a character is in one of the classes + * which CSS says should be part of the first-letter. Currently, that is + * all punctuation classes (P*). Note that this is a change from CSS2 + * which excluded Pc and Pd. + * + * https://www.w3.org/TR/css-pseudo-4/#first-letter-pseudo + * "Punctuation (i.e, characters that belong to the Punctuation (P*) Unicode + * general category [UAX44]) [...]" + */ + +// static +bool nsContentUtils::IsFirstLetterPunctuation(uint32_t aChar) { + switch (mozilla::unicode::GetGeneralCategory(aChar)) { + case HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION: /* Pc */ + case HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION: /* Pd */ + case HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION: /* Pe */ + case HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION: /* Pf */ + case HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION: /* Pi */ + case HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION: /* Po */ + case HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION: /* Ps */ + return true; + default: + return false; + } +} + // static bool nsContentUtils::IsAlphanumeric(uint32_t aChar) { nsUGenCategory cat = mozilla::unicode::GetGenCategory(aChar); diff --git a/dom/base/nsContentUtils.h b/dom/base/nsContentUtils.h index e8acff918e31..96044e9c8c46 100644 --- a/dom/base/nsContentUtils.h +++ b/dom/base/nsContentUtils.h @@ -809,6 +809,11 @@ class nsContentUtils { static const nsDependentSubstring TrimWhitespace(const nsAString& aStr, bool aTrimTrailing = true); + /** + * Returns true if aChar is of class Pc, Pd, Pe, Pf, Pi, Po, or Ps. 
+ */ + static bool IsFirstLetterPunctuation(uint32_t aChar); + /** * Returns true if aChar is of class Lu, Ll, Lt, Lm, Lo, Nd, Nl or No */ diff --git a/layout/generic/nsTextFrame.cpp b/layout/generic/nsTextFrame.cpp index f07144f2e4d6..613443dd081b 100644 --- a/layout/generic/nsTextFrame.cpp +++ b/layout/generic/nsTextFrame.cpp @@ -8402,61 +8402,15 @@ std::pair nsTextFrame::GetOffsets() const { return std::make_pair(GetContentOffset(), GetContentEnd()); } -static bool IsFirstLetterPrefixPunctuation(uint32_t aChar) { - switch (mozilla::unicode::GetGeneralCategory(aChar)) { - case HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION: /* Pc */ - case HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION: /* Pd */ - case HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION: /* Pe */ - case HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION: /* Pf */ - case HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION: /* Pi */ - case HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION: /* Po */ - case HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION: /* Ps */ - return true; - default: - return false; - } -} - -static bool IsFirstLetterSuffixPunctuation(uint32_t aChar) { - switch (mozilla::unicode::GetGeneralCategory(aChar)) { - case HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION: /* Pc */ - case HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION: /* Pe */ - case HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION: /* Pf */ - case HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION: /* Pi */ - case HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION: /* Po */ - return true; - default: - return false; - } -} - -static int32_t FindEndOfPrefixPunctuationRun(const nsTextFragment* aFrag, - const gfxTextRun* aTextRun, - gfxSkipCharsIterator* aIter, - int32_t aOffset, int32_t aStart, - int32_t aEnd) { +static int32_t FindEndOfPunctuationRun(const nsTextFragment* aFrag, + const gfxTextRun* aTextRun, + gfxSkipCharsIterator* aIter, + int32_t aOffset, int32_t aStart, + int32_t aEnd) { int32_t i; - for (i = aStart; i < aEnd - aOffset; ++i) { - if 
(IsFirstLetterPrefixPunctuation( - aFrag->ScalarValueAt(AssertedCast(aOffset + i)))) { - aIter->SetOriginalOffset(aOffset + i); - FindClusterEnd(aTextRun, aEnd, aIter); - i = aIter->GetOriginalOffset() - aOffset; - } else { - break; - } - } - return i; -} -static int32_t FindEndOfSuffixPunctuationRun(const nsTextFragment* aFrag, - const gfxTextRun* aTextRun, - gfxSkipCharsIterator* aIter, - int32_t aOffset, int32_t aStart, - int32_t aEnd) { - int32_t i; for (i = aStart; i < aEnd - aOffset; ++i) { - if (IsFirstLetterSuffixPunctuation( + if (nsContentUtils::IsFirstLetterPunctuation( aFrag->ScalarValueAt(AssertedCast(aOffset + i)))) { aIter->SetOriginalOffset(aOffset + i); FindClusterEnd(aTextRun, aEnd, aIter); @@ -8486,6 +8440,7 @@ static bool FindFirstLetterRange(const nsTextFragment* aFrag, const gfxTextRun* aTextRun, int32_t aOffset, const gfxSkipCharsIterator& aIter, int32_t* aLength) { + int32_t i; int32_t length = *aLength; int32_t endOffset = aOffset + length; gfxSkipCharsIterator iter(aIter); @@ -8509,39 +8464,25 @@ static bool FindFirstLetterRange(const nsTextFragment* aFrag, return false; }; - // Skip any trimmable leading whitespace. - int32_t i = GetTrimmableWhitespaceCount(aFrag, aOffset, length, 1); - while (true) { - // Scan past any leading punctuation. This leaves `j` at the first - // non-punctuation character. - int32_t j = FindEndOfPrefixPunctuationRun(aFrag, aTextRun, &iter, aOffset, - i, endOffset); - if (j == length) { - return false; - } + // skip leading whitespace, then consume clusters that start with punctuation + i = FindEndOfPunctuationRun( + aFrag, aTextRun, &iter, aOffset, + GetTrimmableWhitespaceCount(aFrag, aOffset, length, 1), endOffset); + if (i == length) { + return false; + } - // Scan past any Unicode whitespace characters after punctuation. 
- while (j < length) { - char16_t ch = aFrag->CharAt(AssertedCast(aOffset + j)); - // The spec says to allow "characters that belong to the `Zs` Unicode - // general category _other than_ U+3000" here. - if (unicode::GetGeneralCategory(ch) == - HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR && - ch != 0x3000) { - ++j; - } else { - break; - } - } - if (j == length) { - return false; - } - if (j == i) { - // If no whitespace was found, we've finished the first-letter prefix; - // if there was some, then go back to check for more punctuation. + // skip space/no-break-space after punctuation + while (i < length) { + char16_t ch = aFrag->CharAt(AssertedCast(aOffset + i)); + if (ch == ' ' || ch == CH_NBSP) { + ++i; + } else { break; } - i = j; + } + if (i == length) { + return false; } // If the next character is not a letter, number or symbol, there is no @@ -8554,7 +8495,7 @@ static bool FindFirstLetterRange(const nsTextFragment* aFrag, return true; } - // Consume another cluster (the actual first letter): + // consume another cluster (the actual first letter) // For complex scripts such as Indic and SEAsian, where first-letter // should extend to entire orthographic "syllable" clusters, we don't @@ -8625,12 +8566,9 @@ static bool FindFirstLetterRange(const nsTextFragment* aFrag, break; } - // NOTE that FindClusterEnd sets the iterator to the last character that is - // part of the cluster, NOT to the first character beyond it. iter.SetOriginalOffset(aOffset + i); FindClusterEnd(aTextRun, endOffset, &iter, allowSplitLigature); - // Index of the last character included in the first-letter cluster. i = iter.GetOriginalOffset() - aOffset; // Heuristic for Indic scripts that like to form conjuncts: @@ -8678,44 +8616,9 @@ static bool FindFirstLetterRange(const nsTextFragment* aFrag, } } - // When we reach here, `i` points to the last character of the first-letter - // cluster, NOT to the first character beyond it. 
Advance to the next char, - // ready to check for following whitespace/punctuation: - ++i; - - while (i < length) { - // Skip over whitespace, except for word separator characters, before the - // check for following punctuation. But remember the position before the - // whitespace, in case we need to reset. - const int32_t preWS = i; - while (i < length) { - char16_t ch = aFrag->CharAt(AssertedCast(aOffset + i)); - // The spec says the first-letter suffix includes "any intervening - // typographic space -- characters belonging to the Zs Unicode general - // category other than U+3000 IDEOGRAPHIC SPACE or a word separator", - // where "word separator" includes U+0020 and U+00A0. - if (ch == 0x0020 || ch == 0x00A0 || ch == 0x3000 || - unicode::GetGeneralCategory(ch) != - HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR) { - break; - } else { - ++i; - } - } - - // Consume clusters that start with punctuation. - const int32_t prePunct = i; - i = FindEndOfSuffixPunctuationRun(aFrag, aTextRun, &iter, aOffset, i, - endOffset); - - // If we didn't find punctuation here, then we also don't want to include - // any preceding whitespace, so reset our index. - if (i == prePunct) { - i = preWS; - break; - } - } - + // consume clusters that start with punctuation + i = FindEndOfPunctuationRun(aFrag, aTextRun, &iter, aOffset, i + 1, + endOffset); if (i < length) { *aLength = i; }