Bug 1648464 - Add an Autodetect item to the Text Encoding menu. r=emk,chutten,Gijs

Take a step towards replacing the encoding menu with a single menu item that
triggers the autodetection manually. However, don't remove anything for now.

* Add an autodetect item.
* Add telemetry for autodetect used in session.
* Add telemetry for non-autodetect used in session.
* Restore and revise the telemetry that records how the encoding being
  overridden was discovered.

Differential Revision: https://phabricator.services.mozilla.com/D81132
This commit is contained in:
Henri Sivonen
2021-01-14 07:06:53 +00:00
parent 8f03ff4322
commit dc81bb2634
14 changed files with 372 additions and 117 deletions

View File

@@ -285,20 +285,56 @@ void nsHtml5StreamParser::GuessEncoding(bool aEof, bool aInitial) {
if (!aInitial) {
mGuessEncoding = false;
}
auto encoding = mDetector->Guess(mTLD, mDecodingLocalFileWithoutTokenizing);
bool forced = (mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
mCharsetSource == kCharsetFromInitialUserForcedAutoDetection);
MOZ_ASSERT(
mCharsetSource != kCharsetFromFinalJapaneseAutoDetection &&
mCharsetSource != kCharsetFromFinalUserForcedAutoDetection &&
mCharsetSource != kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8 &&
mCharsetSource != kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8 &&
mCharsetSource != kCharsetFromFinalAutoDetectionFile);
auto encoding =
forced ? mDetector->Guess(EmptyCString(), true)
: mDetector->Guess(mTLD, mDecodingLocalFileWithoutTokenizing);
auto source =
aInitial
? (forced ? kCharsetFromInitialUserForcedAutoDetection
: kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8)
: (forced
? kCharsetFromFinalUserForcedAutoDetection
: (mDecodingLocalFileWithoutTokenizing
? kCharsetFromFinalAutoDetectionFile
: kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8));
if (source == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8) {
if (mDetector->Guess(EmptyCString(), true) == UTF_8_ENCODING) {
source = kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8;
}
} else if (source == kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8) {
if (mDetector->Guess(EmptyCString(), true) == UTF_8_ENCODING) {
source = kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8;
}
}
if (HasDecoder() && !mDecodingLocalFileWithoutTokenizing) {
if (mEncoding == encoding) {
auto source = aInitial ? kCharsetFromInitialAutoDetection
: kCharsetFromFinalAutoDetection;
MOZ_ASSERT(mCharsetSource < source, "Why are we running chardet at all?");
// Source didn't actually change between initial and final, so roll it
// back for future telemetry purposes, while taking into account the final
// UTF-8ness. https://bugzilla.mozilla.org/show_bug.cgi?id=1686463
if (source == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8) {
source = kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8;
} else if (source == kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8) {
source = kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8;
} else if (source == kCharsetFromFinalUserForcedAutoDetection) {
source = kCharsetFromInitialUserForcedAutoDetection;
}
mCharsetSource = source;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
} else {
MOZ_ASSERT(mCharsetSource < kCharsetFromFinalAutoDetection);
MOZ_ASSERT(mCharsetSource < kCharsetFromFinalJapaneseAutoDetection ||
forced);
// We've already committed to a decoder. Request a reload from the
// docshell.
mTreeBuilder->NeedsCharsetSwitchTo(encoding,
kCharsetFromFinalAutoDetection, 0);
mTreeBuilder->NeedsCharsetSwitchTo(encoding, source, 0);
FlushTreeOpsAndDisarmTimer();
Interrupt();
}
@@ -306,8 +342,7 @@ void nsHtml5StreamParser::GuessEncoding(bool aEof, bool aInitial) {
// Got a confident answer from the sniffing buffer. That code will
// take care of setting up the decoder.
mEncoding = encoding;
mCharsetSource = aInitial ? kCharsetFromInitialAutoDetection
: kCharsetFromFinalAutoDetection;
mCharsetSource = source;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
}
}
@@ -320,9 +355,9 @@ void nsHtml5StreamParser::FeedJapaneseDetector(Span<const uint8_t> aBuffer,
return;
}
DontGuessEncoding();
int32_t source = kCharsetFromFinalAutoDetection;
int32_t source = kCharsetFromFinalJapaneseAutoDetection;
if (mCharsetSource == kCharsetFromUserForced) {
source = kCharsetFromUserForcedAutoDetection;
source = kCharsetFromUserForcedJapaneseAutoDetection;
}
if (detected == mEncoding) {
MOZ_ASSERT(mCharsetSource < source, "Why are we running chardet at all?");
@@ -396,12 +431,14 @@ nsHtml5StreamParser::SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
NS_ASSERTION(IsParserThread(), "Wrong thread!");
nsresult rv = NS_OK;
if (mDecodingLocalFileWithoutTokenizing &&
mCharsetSource <= kCharsetFromFileURLGuess) {
mCharsetSource <= kCharsetFromTopLevelDomain) {
MOZ_ASSERT(mEncoding != UTF_8_ENCODING);
mUnicodeDecoder = UTF_8_ENCODING->NewDecoderWithBOMRemoval();
} else {
if (mCharsetSource >= kCharsetFromFinalAutoDetection) {
if (mCharsetSource != kCharsetFromUserForced) {
if (mCharsetSource >= kCharsetFromFinalJapaneseAutoDetection) {
if (!(mCharsetSource == kCharsetFromUserForced ||
mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
mCharsetSource == kCharsetFromInitialUserForcedAutoDetection)) {
DontGuessEncoding();
}
mDecodingLocalFileWithoutTokenizing = false;
@@ -582,7 +619,7 @@ nsresult nsHtml5StreamParser::FinalizeSniffing(Span<const uint8_t> aFromSegment,
uint32_t aCountToSniffingLimit,
bool aEof) {
MOZ_ASSERT(IsParserThread(), "Wrong thread!");
MOZ_ASSERT(mCharsetSource < kCharsetFromUserForcedAutoDetection,
MOZ_ASSERT(mCharsetSource < kCharsetFromUserForcedJapaneseAutoDetection,
"Should not finalize sniffing with strong decision already made.");
if (mMode == VIEW_SOURCE_XML) {
static const XML_Memory_Handling_Suite memsuite = {
@@ -765,6 +802,12 @@ nsresult nsHtml5StreamParser::SniffStreamBytes(
if (mSniffingLength + aFromSegment.Length() >= SNIFFING_BUFFER_SIZE) {
// this is the last buffer
uint32_t countToSniffingLimit = SNIFFING_BUFFER_SIZE - mSniffingLength;
bool forced =
(mCharsetSource == kCharsetFromUserForced ||
mCharsetSource == kCharsetFromUserForcedJapaneseAutoDetection ||
mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
mCharsetSource == kCharsetFromInitialUserForcedAutoDetection ||
mCharsetSource == kCharsetFromFinalUserForcedAutoDetection);
if (mMode == NORMAL || mMode == VIEW_SOURCE_HTML || mMode == LOAD_AS_DATA) {
nsHtml5ByteReadable readable(
aFromSegment.Elements(),
@@ -779,19 +822,25 @@ nsresult nsHtml5StreamParser::SniffStreamBytes(
}
if (encoding) {
// meta scan successful; honor overrides unless meta is XSS-dangerous
if ((mCharsetSource == kCharsetFromUserForced) &&
(encoding->IsAsciiCompatible() ||
encoding == ISO_2022_JP_ENCODING)) {
if (forced && (encoding->IsAsciiCompatible() ||
encoding == ISO_2022_JP_ENCODING)) {
// Honor override
if (mEncoding->IsJapaneseLegacy()) {
if (mCharsetSource == kCharsetFromUserForced &&
mEncoding->IsJapaneseLegacy()) {
mFeedChardet = true;
if (!mJapaneseDetector) {
mJapaneseDetector = mozilla::JapaneseDetector::Create(true);
}
FinalizeSniffingWithDetector(aFromSegment, countToSniffingLimit,
false);
} else {
} else if (mCharsetSource ==
kCharsetFromUserForcedJapaneseAutoDetection ||
mCharsetSource ==
kCharsetFromFinalUserForcedAutoDetection) {
DontGuessEncoding();
} else {
FinalizeSniffingWithDetector(aFromSegment, countToSniffingLimit,
false);
}
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
aFromSegment);
@@ -803,16 +852,21 @@ nsresult nsHtml5StreamParser::SniffStreamBytes(
aFromSegment);
}
}
if (mCharsetSource == kCharsetFromUserForced) {
if (forced) {
// meta not found, honor override
if (mEncoding->IsJapaneseLegacy()) {
if (mCharsetSource == kCharsetFromUserForced &&
mEncoding->IsJapaneseLegacy()) {
mFeedChardet = true;
if (!mJapaneseDetector) {
mJapaneseDetector = mozilla::JapaneseDetector::Create(true);
}
FinalizeSniffingWithDetector(aFromSegment, countToSniffingLimit, false);
} else {
} else if (mCharsetSource ==
kCharsetFromUserForcedJapaneseAutoDetection ||
mCharsetSource == kCharsetFromFinalUserForcedAutoDetection) {
DontGuessEncoding();
} else {
FinalizeSniffingWithDetector(aFromSegment, countToSniffingLimit, false);
}
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment);
}
@@ -833,15 +887,24 @@ nsresult nsHtml5StreamParser::SniffStreamBytes(
}
if (encoding) {
// meta scan successful; honor overrides unless meta is XSS-dangerous
if ((mCharsetSource == kCharsetFromUserForced) &&
if ((mCharsetSource == kCharsetFromUserForced ||
mCharsetSource == kCharsetFromUserForcedJapaneseAutoDetection ||
mCharsetSource == kCharsetFromFinalUserForcedAutoDetection) &&
(encoding->IsAsciiCompatible() || encoding == ISO_2022_JP_ENCODING)) {
// Honor override
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
aFromSegment);
}
mEncoding = WrapNotNull(encoding);
mCharsetSource = kCharsetFromMetaPrescan;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
if ((mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
mCharsetSource == kCharsetFromInitialUserForcedAutoDetection) &&
(encoding->IsAsciiCompatible() || encoding == ISO_2022_JP_ENCODING)) {
FinalizeSniffingWithDetector(aFromSegment, aFromSegment.Length(),
false);
} else {
mEncoding = WrapNotNull(encoding);
mCharsetSource = kCharsetFromMetaPrescan;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
}
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment);
}
}
@@ -1053,7 +1116,7 @@ nsresult nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest) {
nsresult rv = GetChannel(getter_AddRefs(channel));
if (NS_SUCCEEDED(rv)) {
isSrcdoc = NS_IsSrcdocChannel(channel);
if (!isSrcdoc && mCharsetSource <= kCharsetFromFileURLGuess) {
if (!isSrcdoc && mCharsetSource <= kCharsetFromTopLevelDomain) {
nsCOMPtr<nsIURI> originalURI;
rv = channel->GetOriginalURI(getter_AddRefs(originalURI));
if (NS_SUCCEEDED(rv)) {
@@ -1185,43 +1248,47 @@ nsresult nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest) {
mInitialEncodingWasFromParentFrame = true;
}
if (mCharsetSource >= kCharsetFromFinalAutoDetection) {
if ((mCharsetSource == kCharsetFromUserForced) &&
mEncoding->IsJapaneseLegacy()) {
// Japanese detector only
if (!mJapaneseDetector) {
mJapaneseDetector = mozilla::JapaneseDetector::Create(true);
if (!(mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
mCharsetSource == kCharsetFromInitialUserForcedAutoDetection ||
mCharsetSource == kCharsetFromFinalUserForcedAutoDetection)) {
if (mCharsetSource >= kCharsetFromFinalJapaneseAutoDetection) {
if ((mCharsetSource == kCharsetFromUserForced) &&
mEncoding->IsJapaneseLegacy()) {
// Japanese detector only
if (!mJapaneseDetector) {
mJapaneseDetector = mozilla::JapaneseDetector::Create(true);
}
mGuessEncoding = false;
} else {
DontGuessEncoding();
}
mGuessEncoding = false;
} else {
DontGuessEncoding();
}
}
// Compute various pref-based special cases
if (!mDecodingLocalFileWithoutTokenizing && mFeedChardet) {
if (mTLD.EqualsLiteral("jp")) {
if (!mJapaneseDetector &&
!StaticPrefs::intl_charset_detector_ng_jp_enabled()) {
mJapaneseDetector = mozilla::JapaneseDetector::Create(true);
// Compute various pref-based special cases
if (!mDecodingLocalFileWithoutTokenizing && mFeedChardet) {
if (mTLD.EqualsLiteral("jp")) {
if (!mJapaneseDetector &&
!StaticPrefs::intl_charset_detector_ng_jp_enabled()) {
mJapaneseDetector = mozilla::JapaneseDetector::Create(true);
}
if (mJapaneseDetector && mEncoding == WINDOWS_1252_ENCODING &&
mCharsetSource <= kCharsetFromTopLevelDomain) {
mCharsetSource = kCharsetFromTopLevelDomain;
mEncoding = SHIFT_JIS_ENCODING;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
}
} else if ((mTLD.EqualsLiteral("in") &&
!StaticPrefs::intl_charset_detector_ng_in_enabled()) ||
(mTLD.EqualsLiteral("lk") &&
!StaticPrefs::intl_charset_detector_ng_lk_enabled())) {
if (mEncoding == WINDOWS_1252_ENCODING &&
mCharsetSource <= kCharsetFromTopLevelDomain) {
// Avoid breaking font hacks that Chrome doesn't break.
mCharsetSource = kCharsetFromTopLevelDomain;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
}
DontGuessEncoding();
}
if (mJapaneseDetector && mEncoding == WINDOWS_1252_ENCODING &&
mCharsetSource <= kCharsetFromTopLevelDomain) {
mCharsetSource = kCharsetFromTopLevelDomain;
mEncoding = SHIFT_JIS_ENCODING;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
}
} else if ((mTLD.EqualsLiteral("in") &&
!StaticPrefs::intl_charset_detector_ng_in_enabled()) ||
(mTLD.EqualsLiteral("lk") &&
!StaticPrefs::intl_charset_detector_ng_lk_enabled())) {
if (mEncoding == WINDOWS_1252_ENCODING &&
mCharsetSource <= kCharsetFromTopLevelDomain) {
// Avoid breaking font hacks that Chrome doesn't break.
mCharsetSource = kCharsetFromTopLevelDomain;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
}
DontGuessEncoding();
}
}