Bug 1686463 - Gather telemetry about automatic encoding detection outcomes. r=chutten,emk

Differential Revision: https://phabricator.services.mozilla.com/D102397
This commit is contained in:
Henri Sivonen
2021-01-24 00:11:07 +00:00
parent 31d21d1b46
commit 1d440d9632
21 changed files with 355 additions and 79 deletions

View File

@@ -228,6 +228,8 @@ nsHtml5StreamParser::nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
mLoadFlusher(new nsHtml5LoadFlusher(aExecutor)),
mInitialEncodingWasFromParentFrame(false),
mHasHadErrors(false),
mDetectorHasSeenNonAscii(false),
mDetectorHadOnlySeenAsciiWhenFirstGuessing(false),
mDecodingLocalFileWithoutTokenizing(false),
mFlushTimer(NS_NewTimer(mEventTarget)),
mFlushTimerMutex("nsHtml5StreamParser mFlushTimerMutex"),
@@ -278,11 +280,36 @@ nsresult nsHtml5StreamParser::GetChannel(nsIChannel** aChannel) {
: NS_ERROR_NOT_AVAILABLE;
}
int32_t nsHtml5StreamParser::MaybeRollBackSource(int32_t aSource) {
if (aSource ==
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD) {
return kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD;
}
if (aSource == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic) {
return kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic;
}
if (aSource == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content) {
return kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content;
}
if (aSource == kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8 &&
!mDetectorHadOnlySeenAsciiWhenFirstGuessing) {
return kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8;
}
if (aSource == kCharsetFromFinalUserForcedAutoDetection) {
aSource = kCharsetFromInitialUserForcedAutoDetection;
}
return aSource;
}
void nsHtml5StreamParser::GuessEncoding(bool aEof, bool aInitial) {
if (mJapaneseDetector) {
return;
}
if (!aInitial) {
if (aInitial) {
if (!mDetectorHasSeenNonAscii) {
mDetectorHadOnlySeenAsciiWhenFirstGuessing = true;
}
} else {
mGuessEncoding = false;
}
bool forced = (mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
@@ -291,43 +318,66 @@ void nsHtml5StreamParser::GuessEncoding(bool aEof, bool aInitial) {
mCharsetSource != kCharsetFromFinalJapaneseAutoDetection &&
mCharsetSource != kCharsetFromFinalUserForcedAutoDetection &&
mCharsetSource != kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8 &&
mCharsetSource != kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8 &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD &&
mCharsetSource != kCharsetFromFinalAutoDetectionFile);
auto ifHadBeenForced = mDetector->Guess(EmptyCString(), true);
auto encoding =
forced ? mDetector->Guess(EmptyCString(), true)
forced ? ifHadBeenForced
: mDetector->Guess(mTLD, mDecodingLocalFileWithoutTokenizing);
auto source =
int32_t source =
aInitial
? (forced ? kCharsetFromInitialUserForcedAutoDetection
: kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8)
? (forced
? kCharsetFromInitialUserForcedAutoDetection
: kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic)
: (forced
? kCharsetFromFinalUserForcedAutoDetection
: (mDecodingLocalFileWithoutTokenizing
? kCharsetFromFinalAutoDetectionFile
: kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8));
if (source == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8) {
if (mDetector->Guess(EmptyCString(), true) == UTF_8_ENCODING) {
: kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic));
if (source == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic) {
if (encoding == ISO_2022_JP_ENCODING) {
if (EncodingDetector::TldMayAffectGuess(mTLD)) {
source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content;
}
} else if (!mDetectorHasSeenNonAscii) {
source = kCharsetFromInitialAutoDetectionASCII; // deliberately Initial
} else if (ifHadBeenForced == UTF_8_ENCODING) {
source = kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8;
} else if (encoding != ifHadBeenForced) {
source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD;
} else if (EncodingDetector::TldMayAffectGuess(mTLD)) {
source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content;
}
} else if (source == kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8) {
if (mDetector->Guess(EmptyCString(), true) == UTF_8_ENCODING) {
} else if (source ==
kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic) {
if (encoding == ISO_2022_JP_ENCODING) {
if (EncodingDetector::TldMayAffectGuess(mTLD)) {
source = kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content;
}
} else if (!mDetectorHasSeenNonAscii) {
source = kCharsetFromInitialAutoDetectionASCII;
} else if (ifHadBeenForced == UTF_8_ENCODING) {
source = kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8;
} else if (encoding != ifHadBeenForced) {
source =
kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD;
} else if (EncodingDetector::TldMayAffectGuess(mTLD)) {
source = kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content;
}
}
if (HasDecoder() && !mDecodingLocalFileWithoutTokenizing) {
if (mEncoding == encoding) {
MOZ_ASSERT(mCharsetSource < source, "Why are we running chardet at all?");
MOZ_ASSERT(mCharsetSource == kCharsetFromInitialAutoDetectionASCII ||
mCharsetSource < source,
"Why are we running chardet at all?");
// Source didn't actually change between initial and final, so roll it
// back for future telemetry purposes, while taking into account the final
// UTF-8ness. https://bugzilla.mozilla.org/show_bug.cgi?id=1686463
if (source == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8) {
source = kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8;
} else if (source == kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8) {
source = kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8;
} else if (source == kCharsetFromFinalUserForcedAutoDetection) {
source = kCharsetFromInitialUserForcedAutoDetection;
}
mCharsetSource = source;
// back for telemetry purposes.
mCharsetSource = MaybeRollBackSource(source);
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
} else {
MOZ_ASSERT(mCharsetSource < kCharsetFromFinalJapaneseAutoDetection ||
@@ -341,6 +391,11 @@ void nsHtml5StreamParser::GuessEncoding(bool aEof, bool aInitial) {
} else {
// Got a confident answer from the sniffing buffer. That code will
// take care of setting up the decoder.
if (mCharsetSource == kCharsetUninitialized && aEof) {
// The document is so short that the initial buffer is the last
// buffer.
source = MaybeRollBackSource(source);
}
mEncoding = encoding;
mCharsetSource = source;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
@@ -383,7 +438,7 @@ void nsHtml5StreamParser::FeedDetector(Span<const uint8_t> aBuffer,
if (mJapaneseDetector) {
FeedJapaneseDetector(aBuffer, aLast);
} else {
Unused << mDetector->Feed(aBuffer, aLast);
mDetectorHasSeenNonAscii = mDetector->Feed(aBuffer, aLast);
}
}