Bug 1543077 part 2 - Use mozilla::JapaneseDetector in the HTML parser. r=emk

Depends on D27792

Differential Revision: https://phabricator.services.mozilla.com/D27793
This commit is contained in:
Henri Sivonen
2019-05-27 07:56:17 +00:00
parent b4f6617c10
commit 45a30eae39
5 changed files with 237 additions and 69 deletions

View File

@@ -31,7 +31,6 @@
#include "nsIThreadRetargetableRequest.h"
#include "nsPrintfCString.h"
#include "nsNetUtil.h"
#include "nsUdetXPCOMWrapper.h"
#include "nsXULAppAPI.h"
#include "mozilla/SchedulerGroup.h"
#include "nsJSEnvironment.h"
@@ -156,7 +155,7 @@ nsHtml5StreamParser::nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
mBomState(eBomState::BOM_SNIFFING_NOT_STARTED),
mCharsetSource(kCharsetUninitialized),
mEncoding(WINDOWS_1252_ENCODING),
mFeedChardetIfEncoding(nullptr),
mFeedChardet(true),
mReparseForbidden(false),
mLastBuffer(nullptr), // Will be filled when starting
mExecutor(aExecutor),
@@ -181,6 +180,7 @@ nsHtml5StreamParser::nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
mEventTarget(nsHtml5Module::GetStreamParserThread()->SerialEventTarget()),
mExecutorFlusher(new nsHtml5ExecutorFlusher(aExecutor)),
mLoadFlusher(new nsHtml5LoadFlusher(aExecutor)),
mJapaneseDetector(mozilla::JapaneseDetector::Create(true)),
mInitialEncodingWasFromParentFrame(false),
mHasHadErrors(false),
mDecodingLocalFileAsUTF8(false),
@@ -210,16 +210,11 @@ nsHtml5StreamParser::nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
nsAutoCString detectorName;
Preferences::GetLocalizedCString("intl.charset.detector", detectorName);
if (!detectorName.IsEmpty()) {
// We recognize one of the three magic strings for the following languages.
// We recognize one of the two magic strings for Russian and Ukrainian.
if (detectorName.EqualsLiteral("ruprob")) {
mChardet = new nsRUProbDetector();
mFeedChardetIfEncoding = WINDOWS_1251_ENCODING;
} else if (detectorName.EqualsLiteral("ukprob")) {
mChardet = new nsUKProbDetector();
mFeedChardetIfEncoding = WINDOWS_1251_ENCODING;
} else if (detectorName.EqualsLiteral("ja_parallel_state_machine")) {
mChardet = new nsJAPSMDetector();
mFeedChardetIfEncoding = SHIFT_JIS_ENCODING;
}
if (mChardet) {
(void)mChardet->Init(this);
@@ -263,7 +258,7 @@ NS_IMETHODIMP
nsHtml5StreamParser::Notify(const char* aCharset, nsDetectionConfident aConf) {
NS_ASSERTION(IsParserThread(), "Wrong thread!");
if (aConf == eBestAnswer || aConf == eSureAnswer) {
mFeedChardetIfEncoding = nullptr; // just in case
mFeedChardet = false; // just in case
auto encoding =
Encoding::ForLabelNoReplacement(nsDependentCString(aCharset));
if (!encoding) {
@@ -271,8 +266,8 @@ nsHtml5StreamParser::Notify(const char* aCharset, nsDetectionConfident aConf) {
}
if (HasDecoder()) {
if (mEncoding == encoding) {
NS_ASSERTION(mCharsetSource < kCharsetFromAutoDetection,
"Why are we running chardet at all?");
MOZ_ASSERT(mCharsetSource < kCharsetFromAutoDetection,
"Why are we running chardet at all?");
mCharsetSource = kCharsetFromAutoDetection;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
} else {
@@ -294,6 +289,55 @@ nsHtml5StreamParser::Notify(const char* aCharset, nsDetectionConfident aConf) {
return NS_OK;
}
// Passes a chunk of raw bytes to mJapaneseDetector and acts on its verdict.
//
// aBuffer: bytes to hand to the detector.
// aLast:   forwarded to the detector's Feed() call unchanged.
//
// If Feed() returns null, nothing else happens. Otherwise detector feeding is
// switched off (mFeedChardet = false) and one of three things follows:
//  * a decoder already exists and the detected encoding differs from
//    mEncoding: a charset switch is requested through the tree builder and
//    the parser is interrupted;
//  * the detected encoding equals mEncoding: only the charset source is
//    upgraded to kCharsetFromAutoDetection;
//  * no decoder exists yet: mEncoding is replaced by the detected encoding
//    and the charset source is upgraded likewise.
void nsHtml5StreamParser::FeedJapaneseDetector(Span<const uint8_t> aBuffer,
                                               bool aLast) {
  const Encoding* guess = mJapaneseDetector->Feed(aBuffer, aLast);
  if (guess) {
    // The detector has made up its mind; no further input is needed.
    mFeedChardet = false;

    const bool matchesCurrent = (guess == mEncoding);
    if (!matchesCurrent && HasDecoder()) {
      // We've already committed to a decoder. Request a reload from the
      // docshell.
      mTreeBuilder->NeedsCharsetSwitchTo(WrapNotNull(guess),
                                         kCharsetFromAutoDetection, 0);
      FlushTreeOpsAndDisarmTimer();
      Interrupt();
      return;
    }

    if (matchesCurrent) {
      MOZ_ASSERT(mCharsetSource < kCharsetFromAutoDetection,
                 "Why are we running chardet at all?");
    } else {
      // Got a confident answer from the sniffing buffer. That code will
      // take care of setting up the decoder.
      mEncoding = WrapNotNull(guess);
    }

    // Both remaining outcomes record auto-detection as the charset source
    // and publish the (possibly updated) encoding to the tree builder.
    mCharsetSource = kCharsetFromAutoDetection;
    mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
  }
}
// Routes a chunk of raw input bytes to the detector that applies to the
// current encoding guess.
//
// aBuffer: bytes to examine; may be empty (e.g. when only signalling the end
//          of the stream).
// aLast:   true when no more input will follow; the detector is then asked
//          to finish.
//
// Dispatch:
//  * mEncoding is Shift_JIS: delegate to FeedJapaneseDetector().
//  * mEncoding is windows-1251 and mChardet exists: feed mChardet, and stop
//    feeding if it reports via its out-param that it wants no more data.
//  * anything else: no detector applies, so clear mFeedChardet to prevent
//    further calls.
void nsHtml5StreamParser::FeedDetector(Span<const uint8_t> aBuffer,
                                       bool aLast) {
  if (mEncoding == SHIFT_JIS_ENCODING) {
    FeedJapaneseDetector(aBuffer, aLast);
  } else if (mEncoding == WINDOWS_1251_ENCODING && mChardet) {
    // mChardet is only instantiated when a recognized detector name is
    // configured, so it must be null-checked before use: the encoding can be
    // windows-1251 without any Cyrillic detector having been created.
    if (!aBuffer.IsEmpty()) {
      bool dontFeed = false;
      mozilla::Unused << mChardet->DoIt(
          reinterpret_cast<const char*>(aBuffer.Elements()), aBuffer.Length(),
          &dontFeed);
      if (dontFeed) {
        mFeedChardet = false;
      }
    }
    if (aLast) {
      mozilla::Unused << mChardet->Done();
    }
  } else {
    // Either the encoding has no associated detector, or the windows-1251
    // detector was never created. Stop trying to feed.
    mFeedChardet = false;
  }
}
void nsHtml5StreamParser::SetViewSourceTitle(nsIURI* aURL) {
if (recordreplay::IsRecordingOrReplaying()) {
nsAutoCString spec;
@@ -354,7 +398,7 @@ nsresult nsHtml5StreamParser::SetupDecodingFromBom(
mDecodingLocalFileAsUTF8 = false;
mUnicodeDecoder = mEncoding->NewDecoderWithoutBOMHandling();
mCharsetSource = kCharsetFromByteOrderMark;
mFeedChardetIfEncoding = nullptr;
mFeedChardet = false;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
mSniffingBuffer = nullptr;
mMetaScanner = nullptr;
@@ -412,7 +456,7 @@ void nsHtml5StreamParser::SniffBOMlessUTF16BasicLatin(
}
mCharsetSource = kCharsetFromIrreversibleAutoDetection;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
mFeedChardetIfEncoding = nullptr;
mFeedChardet = false;
mTreeBuilder->MaybeComplainAboutCharset("EncBomlessUtf16", true, 0);
}
@@ -548,7 +592,7 @@ nsresult nsHtml5StreamParser::FinalizeSniffing(Span<const uint8_t> aFromSegment,
// meta scan failed.
if (mCharsetSource >= kCharsetFromHintPrevDoc) {
mFeedChardetIfEncoding = nullptr;
mFeedChardet = false;
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment);
}
// Check for BOMless UTF-16 with Basic
@@ -556,40 +600,28 @@ nsresult nsHtml5StreamParser::FinalizeSniffing(Span<const uint8_t> aFromSegment,
SniffBOMlessUTF16BasicLatin(aFromSegment.To(aCountToSniffingLimit));
// the charset may have been set now
// maybe try chardet now;
if ((mFeedChardetIfEncoding == mEncoding) && !mDecodingLocalFileAsUTF8) {
bool dontFeed;
nsresult rv;
if (mFeedChardet && !mDecodingLocalFileAsUTF8) {
if (mSniffingBuffer) {
rv = mChardet->DoIt((const char*)mSniffingBuffer.get(), mSniffingLength,
&dontFeed);
if (dontFeed) {
mFeedChardetIfEncoding = nullptr;
}
NS_ENSURE_SUCCESS(rv, rv);
FeedDetector(MakeSpan(mSniffingBuffer.get(), mSniffingLength), false);
}
if ((mFeedChardetIfEncoding == mEncoding) && !aFromSegment.IsEmpty()) {
rv = mChardet->DoIt(
(const char*)aFromSegment.Elements(),
// Avoid buffer boundary-dependent behavior when
// reparsing is forbidden. If reparse is forbidden,
// act as if we only saw the first 1024 bytes.
// When reparsing isn't forbidden, buffer boundaries
// can have an effect on whether the page is loaded
// once or twice. :-(
mReparseForbidden ? aCountToSniffingLimit : aFromSegment.Length(),
&dontFeed);
if (dontFeed) {
mFeedChardetIfEncoding = nullptr;
}
NS_ENSURE_SUCCESS(rv, rv);
if (mFeedChardet && !aFromSegment.IsEmpty()) {
// Avoid buffer boundary-dependent behavior when
// reparsing is forbidden. If reparse is forbidden,
// act as if we only saw the first 1024 bytes.
// When reparsing isn't forbidden, buffer boundaries
// can have an effect on whether the page is loaded
// once or twice. :-(
FeedDetector(mReparseForbidden ? aFromSegment.To(aCountToSniffingLimit)
: aFromSegment,
false);
}
if ((mFeedChardetIfEncoding == mEncoding) && (aEof || mReparseForbidden)) {
// mReparseForbidden is checked so that we get to use the sniffing
// buffer with the best guess so far if we aren't allowed to guess
// better later.
mFeedChardetIfEncoding = nullptr;
rv = mChardet->Done();
NS_ENSURE_SUCCESS(rv, rv);
if (mFeedChardet && aEof &&
(!mReparseForbidden ||
aCountToSniffingLimit == aFromSegment.Length())) {
// Don't signal EOF if reparse is forbidden and we didn't pass all input
// to the detector above.
mFeedChardet = false;
FeedDetector(Span<const uint8_t>(), true);
}
// fall thru; callback may have changed charset
}
@@ -600,7 +632,7 @@ nsresult nsHtml5StreamParser::FinalizeSniffing(Span<const uint8_t> aFromSegment,
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
} else if (mMode == LOAD_AS_DATA && mCharsetSource == kCharsetFromFallback) {
NS_ASSERTION(mReparseForbidden, "Reparse should be forbidden for XHR");
NS_ASSERTION(!mFeedChardetIfEncoding, "Should not feed chardet for XHR");
NS_ASSERTION(!mFeedChardet, "Should not feed chardet for XHR");
NS_ASSERTION(mEncoding == UTF_8_ENCODING, "XHR should default to UTF-8");
// Now mark charset source as non-weak to signal that we have a decision
mCharsetSource = kCharsetFromDocTypeDefault;
@@ -687,7 +719,7 @@ nsresult nsHtml5StreamParser::SniffStreamBytes(
// earlier call to SetDocumentCharset(), since we didn't find a BOM and
// overwrite mEncoding. (Note that if the user has overridden the charset,
// we don't come here but check <meta> for XSS-dangerous charsets first.)
mFeedChardetIfEncoding = nullptr;
mFeedChardet = false;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment);
}
@@ -724,7 +756,7 @@ nsresult nsHtml5StreamParser::SniffStreamBytes(
}
mEncoding = WrapNotNull(encoding);
mCharsetSource = kCharsetFromMetaPrescan;
mFeedChardetIfEncoding = nullptr;
mFeedChardet = false;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
aFromSegment);
@@ -761,7 +793,7 @@ nsresult nsHtml5StreamParser::SniffStreamBytes(
}
mEncoding = WrapNotNull(encoding);
mCharsetSource = kCharsetFromMetaPrescan;
mFeedChardetIfEncoding = nullptr;
mFeedChardet = false;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment);
}
@@ -856,7 +888,7 @@ void nsHtml5StreamParser::ReDecodeLocalFile() {
void nsHtml5StreamParser::CommitLocalFileToUTF8() {
MOZ_ASSERT(mDecodingLocalFileAsUTF8);
mDecodingLocalFileAsUTF8 = false;
mFeedChardetIfEncoding = nullptr;
mFeedChardet = false;
mEncoding = UTF_8_ENCODING;
mCharsetSource = kCharsetFromFileURLGuess;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
@@ -1000,7 +1032,7 @@ nsresult nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest) {
// This is the old Gecko behavior but the HTML5 spec disagrees.
// Don't reparse on POST.
mReparseForbidden = true;
mFeedChardetIfEncoding = nullptr; // can't restart anyway
mFeedChardet = false; // can't restart anyway
}
}
@@ -1032,7 +1064,7 @@ nsresult nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest) {
}
if (mCharsetSource >= kCharsetFromAutoDetection) {
mFeedChardetIfEncoding = nullptr;
mFeedChardet = false;
}
if (mCharsetSource < kCharsetFromUtf8OnlyMime) {
@@ -1045,7 +1077,7 @@ nsresult nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest) {
// a browsing context. In the latter case, there's no need to remove the
// BOM manually here, because the UTF-8 decoder removes it.
mReparseForbidden = true;
mFeedChardetIfEncoding = nullptr;
mFeedChardet = false;
// Instantiate the converter here to avoid BOM sniffing.
mDecodingLocalFileAsUTF8 = false;
@@ -1085,8 +1117,9 @@ void nsHtml5StreamParser::DoStopRequest() {
return;
}
}
if ((mFeedChardetIfEncoding == mEncoding) && !mDecodingLocalFileAsUTF8) {
mChardet->Done();
if (mFeedChardet && !mDecodingLocalFileAsUTF8) {
mFeedChardet = false;
FeedDetector(Span<uint8_t>(), true);
}
MOZ_ASSERT(mUnicodeDecoder,
@@ -1246,13 +1279,8 @@ void nsHtml5StreamParser::DoDataAvailable(Span<const uint8_t> aBuffer) {
nsresult rv;
if (HasDecoder()) {
if ((mFeedChardetIfEncoding == mEncoding) && !mDecodingLocalFileAsUTF8) {
bool dontFeed;
mChardet->DoIt((const char*)aBuffer.Elements(), aBuffer.Length(),
&dontFeed);
if (dontFeed) {
mFeedChardetIfEncoding = nullptr;
}
if (mFeedChardet && !mDecodingLocalFileAsUTF8) {
FeedDetector(aBuffer, false);
}
rv = WriteStreamBytes(aBuffer);
} else {
@@ -1411,7 +1439,7 @@ const Encoding* nsHtml5StreamParser::PreferredForInternalEncodingDecl(
}
}
mCharsetSource = kCharsetFromMetaTag; // become confident
mFeedChardetIfEncoding = nullptr; // don't feed chardet when confident
mFeedChardet = false; // don't feed chardet when confident
return nullptr;
}
@@ -1450,7 +1478,7 @@ bool nsHtml5StreamParser::internalEncodingDeclaration(nsHtml5String aEncoding) {
// Avoid having the chardet ask for another restart after this restart
// request.
mFeedChardetIfEncoding = nullptr;
mFeedChardet = false;
mTreeBuilder->NeedsCharsetSwitchTo(WrapNotNull(encoding), kCharsetFromMetaTag,
mTokenizer->getLineNumber());
FlushTreeOpsAndDisarmTimer();