Bug 1935148 - Remove newline (instead of transforming to space) if adjacent to East-Asian punctuation character. r=m_kato

Includes the examples from the report as a testcase, though there is not yet any formal spec for the exact behavior of segment break transformation. (Nevertheless, there is an existing collection of tests, so this just adds one for the punctuation case.)

Differential Revision: https://phabricator.services.mozilla.com/D231476
2025-01-02 10:17:53 +00:00
parent 566b9e36dd
commit b7a8f00f5e
8 changed files with 130 additions and 25 deletions
--- a/intl/components/src/UnicodeProperties.h
+++ b/intl/components/src/UnicodeProperties.h
@@ -171,6 +171,23 @@ class UnicodeProperties final {
    return u_hasBinaryProperty(aCh, prop);
  }

+  /**
+   * Return true if aCh's EastAsianWidth is Fullwidth, Halfwidth or Wide.
+   */
+  static inline bool IsEastAsianWidthFHW(uint32_t aCh) {
+    switch (GetIntPropertyValue(aCh, IntProperty::EastAsianWidth)) {
+      case U_EA_FULLWIDTH:
+      case U_EA_HALFWIDTH:
+      case U_EA_WIDE:
+        return true;
+      case U_EA_AMBIGUOUS:  // Ambiguous, Narrow and Neutral all yield false.
+      case U_EA_NARROW:
+      case U_EA_NEUTRAL:
+        return false;
+    }
+    return false;  // Fallback for any value not matched by a case above.
+  }
+
  /**
   * Check if the width of aCh is full width, half width or wide
   * excluding emoji.
@@ -224,6 +241,24 @@ class UnicodeProperties final {
    return false;
  }

+  /**
+   * Return true if CharType(aCh) is one of the *_Punctuation categories.
+   */
+  static inline bool IsPunctuation(uint32_t aCh) {
+    switch (CharType(aCh)) {
+      case GeneralCategory::Dash_Punctuation:
+      case GeneralCategory::Open_Punctuation:
+      case GeneralCategory::Close_Punctuation:
+      case GeneralCategory::Connector_Punctuation:
+      case GeneralCategory::Other_Punctuation:
+      case GeneralCategory::Initial_Punctuation:
+      case GeneralCategory::Final_Punctuation:
+        return true;
+      default:  // Every other category is reported as non-punctuation.
+        return false;
+    }
+  }
+
  /**
   * Check if the CharType of aCh is math or other symbol.
   */
--- a/intl/unicharutil/util/nsUnicharUtils.cpp
+++ b/intl/unicharutil/util/nsUnicharUtils.cpp
@@ -520,6 +520,11 @@ bool IsSegmentBreakSkipChar(uint32_t u) {
         intl::UnicodeProperties::GetScriptCode(u) != intl::Script::HANGUL;
 }

+bool IsEastAsianPunctuation(uint32_t u) {
+  return intl::UnicodeProperties::IsEastAsianWidthFHW(u) &&  // width F/H/W
+         intl::UnicodeProperties::IsPunctuation(u);  // and a punctuation char
+}
+
 bool IsPunctuationForWordSelect(char16_t aCh) {
  const uint8_t cat = unicode::GetGeneralCategory(aCh);
  switch (cat) {
--- a/intl/unicharutil/util/nsUnicharUtils.h
+++ b/intl/unicharutil/util/nsUnicharUtils.h
@@ -153,7 +153,12 @@ namespace mozilla {
 */
 uint32_t HashUTF8AsUTF16(const char* aUTF8, size_t aLength, bool* aErr);

+/**
+ * Tests used in CSS Segment Break Transformation to determine whether a
+ * newline is discardable.
+ */
 bool IsSegmentBreakSkipChar(uint32_t u);
+bool IsEastAsianPunctuation(uint32_t u);

 /**
 * Return true for all Punctuation categories (Unicode general category P?),
--- a/layout/generic/nsTextFrame.cpp
+++ b/layout/generic/nsTextFrame.cpp
@@ -1322,6 +1322,7 @@ BuildTextRunsScanner::FindBoundaryResult BuildTextRunsScanner::FindBoundaries(
    uint32_t start = textFrame->GetContentOffset();
    uint32_t length = textFrame->GetContentLength();
    const void* text;
+    const nsAtom* language = textFrame->StyleFont()->mLanguage;
    if (frag->Is2b()) {
      // It is possible that we may end up removing all whitespace in
      // a piece of text because of The White Space Processing Rules,
@@ -1336,7 +1337,7 @@ BuildTextRunsScanner::FindBoundaryResult BuildTextRunsScanner::FindBoundaries(
      char16_t* bufStart = aState->mBuffer.Elements();
      char16_t* bufEnd = nsTextFrameUtils::TransformText(
          frag->Get2b() + start, length, bufStart, compression, &incomingFlags,
-          &skipChars, &analysisFlags);
+          &skipChars, &analysisFlags, language);
      text = bufStart;
      length = bufEnd - bufStart;
    } else {
@@ -2318,6 +2319,7 @@ already_AddRefed<gfxTextRun> BuildTextRunsScanner::BuildTextRunForFrames(
    int32_t contentStart = mappedFlow->mStartFrame->GetContentOffset();
    int32_t contentEnd = mappedFlow->GetContentEnd();
    int32_t contentLength = contentEnd - contentStart;
+    const nsAtom* language = f->StyleFont()->mLanguage;

    TextRunMappedFlow* newFlow = &userMappedFlows[i];
    newFlow->mStartFrame = mappedFlow->mStartFrame;
@@ -2339,7 +2341,7 @@ already_AddRefed<gfxTextRun> BuildTextRunsScanner::BuildTextRunForFrames(
      char16_t* bufStart = static_cast<char16_t*>(aTextBuffer);
      char16_t* bufEnd = nsTextFrameUtils::TransformText(
          frag->Get2b() + contentStart, contentLength, bufStart, compression,
-          &mNextRunContextInfo, &skipChars, &analysisFlags);
+          &mNextRunContextInfo, &skipChars, &analysisFlags, language);
      aTextBuffer = bufEnd;
      currentTransformedTextOffset =
          bufEnd - static_cast<const char16_t*>(textPtr);
@@ -2356,7 +2358,7 @@ already_AddRefed<gfxTextRun> BuildTextRunsScanner::BuildTextRunForFrames(
        uint8_t* end = nsTextFrameUtils::TransformText(
            reinterpret_cast<const uint8_t*>(frag->Get1b()) + contentStart,
            contentLength, bufStart, compression, &mNextRunContextInfo,
-            &skipChars, &analysisFlags);
+            &skipChars, &analysisFlags, language);
        aTextBuffer =
            ExpandBuffer(static_cast<char16_t*>(aTextBuffer),
                         tempBuf.Elements(), end - tempBuf.Elements());
@@ -2367,7 +2369,7 @@ already_AddRefed<gfxTextRun> BuildTextRunsScanner::BuildTextRunForFrames(
        uint8_t* end = nsTextFrameUtils::TransformText(
            reinterpret_cast<const uint8_t*>(frag->Get1b()) + contentStart,
            contentLength, bufStart, compression, &mNextRunContextInfo,
-            &skipChars, &analysisFlags);
+            &skipChars, &analysisFlags, language);
        aTextBuffer = end;
        currentTransformedTextOffset =
            end - static_cast<const uint8_t*>(textPtr);
@@ -2618,6 +2620,7 @@ bool BuildTextRunsScanner::SetupLineBreakerContext(gfxTextRun* aTextRun) {
  }

  gfxSkipChars skipChars;
+  const nsAtom* language = mMappedFlows[0].mStartFrame->StyleFont()->mLanguage;

  for (uint32_t i = 0; i < mMappedFlows.Length(); ++i) {
    MappedFlow* mappedFlow = &mMappedFlows[i];
@@ -2639,7 +2642,7 @@ bool BuildTextRunsScanner::SetupLineBreakerContext(gfxTextRun* aTextRun) {
      char16_t* bufStart = static_cast<char16_t*>(textPtr);
      char16_t* bufEnd = nsTextFrameUtils::TransformText(
          frag->Get2b() + contentStart, contentLength, bufStart, compression,
-          &mNextRunContextInfo, &skipChars, &analysisFlags);
+          &mNextRunContextInfo, &skipChars, &analysisFlags, language);
      textPtr = bufEnd;
    } else {
      if (mDoubleByteText) {
@@ -2653,7 +2656,7 @@ bool BuildTextRunsScanner::SetupLineBreakerContext(gfxTextRun* aTextRun) {
        uint8_t* end = nsTextFrameUtils::TransformText(
            reinterpret_cast<const uint8_t*>(frag->Get1b()) + contentStart,
            contentLength, bufStart, compression, &mNextRunContextInfo,
-            &skipChars, &analysisFlags);
+            &skipChars, &analysisFlags, language);
        textPtr = ExpandBuffer(static_cast<char16_t*>(textPtr),
                               tempBuf.Elements(), end - tempBuf.Elements());
      } else {
@@ -2661,7 +2664,7 @@ bool BuildTextRunsScanner::SetupLineBreakerContext(gfxTextRun* aTextRun) {
        uint8_t* end = nsTextFrameUtils::TransformText(
            reinterpret_cast<const uint8_t*>(frag->Get1b()) + contentStart,
            contentLength, bufStart, compression, &mNextRunContextInfo,
-            &skipChars, &analysisFlags);
+            &skipChars, &analysisFlags, language);
        textPtr = end;
      }
    }
--- a/layout/generic/nsTextFrameUtils.cpp
+++ b/layout/generic/nsTextFrameUtils.cpp
@@ -86,7 +86,8 @@ static CharT* TransformWhiteSpaces(
    const CharT* aText, uint32_t aLength, uint32_t aBegin, uint32_t aEnd,
    bool aHasSegmentBreak, bool& aInWhitespace, CharT* aOutput,
    nsTextFrameUtils::Flags& aFlags,
-    nsTextFrameUtils::CompressionMode aCompression, gfxSkipChars* aSkipChars) {
+    nsTextFrameUtils::CompressionMode aCompression, gfxSkipChars* aSkipChars,
+    bool aLangIsJapaneseOrChinese) {
  MOZ_ASSERT(aCompression == nsTextFrameUtils::COMPRESS_WHITESPACE ||
                 aCompression == nsTextFrameUtils::COMPRESS_WHITESPACE_NEWLINE,
             "whitespaces should be skippable!!");
@@ -126,10 +127,15 @@ static CharT* TransformWhiteSpaces(
      }
    } while (IsDefaultIgnorable(ucs4after) && pos < aLength);

-    // Discard newlines between characters that have F, W, or H
-    // EastAsianWidth property and neither side is Hangul.
+    // Discard newlines between characters that have F, W, or H EastAsianWidth
+    // property and neither side is Hangul.
+    // For Japanese/Chinese, also discard if *either* character is a fullwidth,
+    // halfwidth, or wide punctuation character.
    isSegmentBreakSkippable =
-        IsSegmentBreakSkipChar(ucs4before) && IsSegmentBreakSkipChar(ucs4after);
+        (IsSegmentBreakSkipChar(ucs4before) &&
+         IsSegmentBreakSkipChar(ucs4after)) ||
+        (aLangIsJapaneseOrChinese && (IsEastAsianPunctuation(ucs4before) ||
+                                      IsEastAsianPunctuation(ucs4after)));
  }

  for (uint32_t i = aBegin; i < aEnd; ++i) {
@@ -201,12 +207,10 @@ static CharT* TransformWhiteSpaces(
 }

 template <class CharT>
-CharT* nsTextFrameUtils::TransformText(const CharT* aText, uint32_t aLength,
-                                       CharT* aOutput,
-                                       CompressionMode aCompression,
-                                       uint8_t* aIncomingFlags,
-                                       gfxSkipChars* aSkipChars,
-                                       Flags* aAnalysisFlags) {
+CharT* nsTextFrameUtils::TransformText(
+    const CharT* aText, uint32_t aLength, CharT* aOutput,
+    CompressionMode aCompression, uint8_t* aIncomingFlags,
+    gfxSkipChars* aSkipChars, Flags* aAnalysisFlags, const nsAtom* aLanguage) {
  Flags flags = Flags();
 #ifdef DEBUG
  int32_t skipCharsOffset = aSkipChars->GetOriginalCharCount();
@@ -247,6 +251,19 @@ CharT* nsTextFrameUtils::TransformText(const CharT* aText, uint32_t aLength,
    }
    *aIncomingFlags &= ~INCOMING_WHITESPACE;
  } else {
+    bool langIsJapaneseOrChinese = [=]() {  // "ja", "zh", "ja-*", "zh-*"
+      if (!aLanguage || aLanguage->GetLength() < 2) {  // Null or too short.
+        return false;
+      }
+      const char16_t* text = aLanguage->GetUTF16String();
+      if ((ToLowerCaseASCII(text[0]) == char16_t('j') &&
+           ToLowerCaseASCII(text[1]) == char16_t('a')) ||
+          (ToLowerCaseASCII(text[0]) == char16_t('z') &&
+           ToLowerCaseASCII(text[1]) == char16_t('h'))) {
+        return aLanguage->GetLength() == 2 || text[2] == '-';  // "xx" or "xx-*"
+      }
+      return false;  // Any other primary language subtag.
+    }();
    bool inWhitespace = (*aIncomingFlags & INCOMING_WHITESPACE) != 0;
    uint32_t i;
    for (i = 0; i < aLength; ++i) {
@@ -284,9 +301,9 @@ CharT* nsTextFrameUtils::TransformText(const CharT* aText, uint32_t aLength,
          j--;
        }
        if (j > i) {
-          aOutput = TransformWhiteSpaces(aText, aLength, i, j, hasSegmentBreak,
-                                         inWhitespace, aOutput, flags,
-                                         aCompression, aSkipChars);
+          aOutput = TransformWhiteSpaces(
+              aText, aLength, i, j, hasSegmentBreak, inWhitespace, aOutput,
+              flags, aCompression, aSkipChars, langIsJapaneseOrChinese);
        }
        // We need to keep KeepChar()/SkipChar() in order, so process the
        // last white space first, then process the trailing discardables.
@@ -347,11 +364,11 @@ CharT* nsTextFrameUtils::TransformText(const CharT* aText, uint32_t aLength,
 template uint8_t* nsTextFrameUtils::TransformText(
    const uint8_t* aText, uint32_t aLength, uint8_t* aOutput,
    CompressionMode aCompression, uint8_t* aIncomingFlags,
-    gfxSkipChars* aSkipChars, Flags* aAnalysisFlags);
+    gfxSkipChars* aSkipChars, Flags* aAnalysisFlags, const nsAtom* aLanguage);
 template char16_t* nsTextFrameUtils::TransformText(
    const char16_t* aText, uint32_t aLength, char16_t* aOutput,
    CompressionMode aCompression, uint8_t* aIncomingFlags,
-    gfxSkipChars* aSkipChars, Flags* aAnalysisFlags);
+    gfxSkipChars* aSkipChars, Flags* aAnalysisFlags, const nsAtom* aLanguage);
 template bool nsTextFrameUtils::IsSkippableCharacterForTransformText(
    uint8_t aChar);
 template bool nsTextFrameUtils::IsSkippableCharacterForTransformText(
--- a/layout/generic/nsTextFrameUtils.h
+++ b/layout/generic/nsTextFrameUtils.h
@@ -10,6 +10,7 @@
 #include "gfxSkipChars.h"
 #include "nsBidiUtils.h"

+class nsAtom;
 class nsIContent;
 struct nsStyleText;

@@ -121,12 +122,15 @@ class nsTextFrameUtils {
   * @param aIncomingFlags a flag indicating whether there was whitespace
   * or an Arabic character preceding this text. We set it to indicate if
   * there's an Arabic character or whitespace preceding the end of this text.
+   * @param aLanguage Content language (used to select Japanese/Chinese behavior
+   * at punctuation, see https://bugzilla.mozilla.org/show_bug.cgi?id=1935148).
   */
  template <class CharT>
  static CharT* TransformText(const CharT* aText, uint32_t aLength,
                              CharT* aOutput, CompressionMode aCompression,
                              uint8_t* aIncomingFlags, gfxSkipChars* aSkipChars,
-                              nsTextFrameUtils::Flags* aAnalysisFlags);
+                              nsTextFrameUtils::Flags* aAnalysisFlags,
+                              const nsAtom* aLanguage);

  /**
   * Returns whether aChar is a character that nsTextFrameUtils::TransformText
--- a/testing/web-platform/tests/css/css-text/line-breaking/segment-break-transformation-punctuation-001-ref.html
+++ b/testing/web-platform/tests/css/css-text/line-breaking/segment-break-transformation-punctuation-001-ref.html
@@ -0,0 +1,7 @@
+<!DOCTYPE html>
+<meta charset="utf-8">
+<title>Segment break transformation: CJK punctuation</title>
+<link rel="author" href="mailto:jkew@mozilla.com">
+
+<p lang="ja">本システムはサポート切れのブラウザに対応しません。Internet Explorerをお使いの場合、Edge・Chrome・Firefoxなどに移行してください。(EdgeはChromium阪をお使いください)</p>
+<p lang="ja">ﾕｰｻﾞﾒｲ｢ｼﾞｮﾝ･ｽﾐｽ｣､ID｢smith｣ﾉｱｶｳﾝﾄｦｼｮｳｷｮｼﾏｽ｡y/N</p>
--- a/testing/web-platform/tests/css/css-text/line-breaking/segment-break-transformation-punctuation-001.html
+++ b/testing/web-platform/tests/css/css-text/line-breaking/segment-break-transformation-punctuation-001.html
@@ -0,0 +1,29 @@
+<!DOCTYPE html>
+<meta charset="utf-8">
+<title>Segment break transformation: CJK punctuation</title>
+<link rel="author" href="mailto:jkew@mozilla.com">
+<link rel="match" href="segment-break-transformation-punctuation-001-ref.html">
+<link rel="help" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1935148">
+<meta name="flags" content="should" />
+
+<p lang="ja">
+  本システムはサポート切れのブラウザに対応しません。
+  Internet Explorerをお使いの場合、
+  Edge
+  ・
+  Chrome
+  ・
+  Firefoxなどに移行してください。
+  (EdgeはChromium阪をお使いください)
+</p>
+<p lang="ja">
+  ﾕｰｻﾞﾒｲ
+  ｢ｼﾞｮﾝ
+  ･
+  ｽﾐｽ｣
+  ､
+  ID
+  ｢smith｣
+  ﾉｱｶｳﾝﾄｦｼｮｳｷｮｼﾏｽ｡
+  y/N
+</p>