Bug 863728 - Implement the replacement encoding. r=emk.

This commit is contained in:
Henri Sivonen
2013-11-25 10:06:56 +02:00
parent 7b4abc26e3
commit bba76134de
36 changed files with 264 additions and 157 deletions

View File

@@ -238,8 +238,16 @@ nsHtml5StreamParser::Notify(const char* aCharset, nsDetectionConfident aConf)
NS_ASSERTION(IsParserThread(), "Wrong thread!");
if (aConf == eBestAnswer || aConf == eSureAnswer) {
mFeedChardet = false; // just in case
nsAutoCString encoding;
if (!EncodingUtils::FindEncodingForLabel(nsDependentCString(aCharset),
encoding)) {
return NS_OK;
}
if (encoding.EqualsLiteral("replacement")) {
return NS_OK;
}
if (HasDecoder()) {
if (mCharset.Equals(aCharset)) {
if (mCharset.Equals(encoding)) {
NS_ASSERTION(mCharsetSource < kCharsetFromAutoDetection,
"Why are we running chardet at all?");
mCharsetSource = kCharsetFromAutoDetection;
@@ -247,8 +255,7 @@ nsHtml5StreamParser::Notify(const char* aCharset, nsDetectionConfident aConf)
} else {
// We've already committed to a decoder. Request a reload from the
// docshell.
nsAutoCString charset(aCharset);
mTreeBuilder->NeedsCharsetSwitchTo(charset,
mTreeBuilder->NeedsCharsetSwitchTo(encoding,
kCharsetFromAutoDetection,
0);
FlushTreeOpsAndDisarmTimer();
@@ -257,7 +264,7 @@ nsHtml5StreamParser::Notify(const char* aCharset, nsDetectionConfident aConf)
} else {
// Got a confident answer from the sniffing buffer. That code will
// take care of setting up the decoder.
mCharset.Assign(aCharset);
mCharset.Assign(encoding);
mCharsetSource = kCharsetFromAutoDetection;
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
}
@@ -299,7 +306,8 @@ nsHtml5StreamParser::SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const
nsresult rv = NS_OK;
nsCOMPtr<nsICharsetConverterManager> convManager = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
NS_ENSURE_SUCCESS(rv, rv);
rv = convManager->GetUnicodeDecoder(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
rv = convManager->GetUnicodeDecoderRaw(mCharset.get(),
getter_AddRefs(mUnicodeDecoder));
if (rv == NS_ERROR_UCONV_NOCONV) {
mCharset.AssignLiteral("windows-1252"); // lower case is the raw form
mCharsetSource = kCharsetFromFallback;
@@ -307,16 +315,6 @@ nsHtml5StreamParser::SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
}
NS_ENSURE_SUCCESS(rv, rv);
return WriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
}
nsresult
nsHtml5StreamParser::WriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment, // can be null
uint32_t aCount,
uint32_t* aWriteCount)
{
NS_ASSERTION(IsParserThread(), "Wrong thread!");
nsresult rv = NS_OK;
if (mSniffingBuffer) {
uint32_t writeCount;
rv = WriteStreamBytes(mSniffingBuffer, mSniffingLength, &writeCount);
@@ -710,29 +708,22 @@ nsHtml5StreamParser::SniffStreamBytes(const uint8_t* aFromSegment,
// if we get here, there either was no BOM or the BOM sniffing isn't complete
// yet
MOZ_ASSERT(mCharsetSource != kCharsetFromByteOrderMark,
"Should not come here if BOM was found.");
MOZ_ASSERT(mCharsetSource != kCharsetFromOtherComponent,
"kCharsetFromOtherComponent is for XSLT.");
if (mBomState == BOM_SNIFFING_OVER &&
mCharsetSource >= kCharsetFromChannel) {
// There was no BOM and the charset came from channel or higher. mCharset
// still contains the charset from the channel or higher as set by an
mCharsetSource == kCharsetFromChannel) {
// There was no BOM and the charset came from channel. mCharset
// still contains the charset from the channel as set by an
// earlier call to SetDocumentCharset(), since we didn't find a BOM and
// overwrite mCharset.
nsCOMPtr<nsICharsetConverterManager> convManager =
do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID);
convManager->GetUnicodeDecoder(mCharset.get(),
getter_AddRefs(mUnicodeDecoder));
if (mUnicodeDecoder) {
mFeedChardet = false;
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
mMetaScanner = nullptr;
return WriteSniffingBufferAndCurrentSegment(aFromSegment,
aCount,
aWriteCount);
} else {
// nsHTMLDocument is supposed to make sure this does not happen. Let's
// deal with this anyway, since who knows how kCharsetFromOtherComponent
// is used.
mCharsetSource = kCharsetFromFallback;
}
// overwrite mCharset. (Note that if the user has overridden the charset,
// we don't come here but check <meta> for XSS-dangerous charsets first.)
mFeedChardet = false;
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment,
aCount, aWriteCount);
}
if (!mMetaScanner && (mMode == NORMAL ||
@@ -748,17 +739,31 @@ nsHtml5StreamParser::SniffStreamBytes(const uint8_t* aFromSegment,
if (mMode == NORMAL || mMode == VIEW_SOURCE_HTML || mMode == LOAD_AS_DATA) {
nsHtml5ByteReadable readable(aFromSegment, aFromSegment +
countToSniffingLimit);
mMetaScanner->sniff(&readable, getter_AddRefs(mUnicodeDecoder), mCharset);
if (mUnicodeDecoder) {
// meta scan successful
nsAutoCString encoding;
mMetaScanner->sniff(&readable, encoding);
if (!encoding.IsEmpty()) {
// meta scan successful; honor overrides unless meta is XSS-dangerous
if ((mCharsetSource == kCharsetFromParentForced ||
mCharsetSource == kCharsetFromUserForced) &&
EncodingUtils::IsAsciiCompatible(encoding)) {
// Honor override
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
aFromSegment, aCount, aWriteCount);
}
mCharset.Assign(encoding);
mCharsetSource = kCharsetFromMetaPrescan;
mFeedChardet = false;
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
mMetaScanner = nullptr;
return WriteSniffingBufferAndCurrentSegment(aFromSegment, aCount,
aWriteCount);
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
aFromSegment, aCount, aWriteCount);
}
}
if (mCharsetSource == kCharsetFromParentForced ||
mCharsetSource == kCharsetFromUserForced) {
// meta not found, honor override
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
aFromSegment, aCount, aWriteCount);
}
return FinalizeSniffing(aFromSegment, aCount, aWriteCount,
countToSniffingLimit);
}
@@ -766,16 +771,23 @@ nsHtml5StreamParser::SniffStreamBytes(const uint8_t* aFromSegment,
// not the last buffer
if (mMode == NORMAL || mMode == VIEW_SOURCE_HTML || mMode == LOAD_AS_DATA) {
nsHtml5ByteReadable readable(aFromSegment, aFromSegment + aCount);
mMetaScanner->sniff(&readable, getter_AddRefs(mUnicodeDecoder), mCharset);
if (mUnicodeDecoder) {
// meta scan successful
nsAutoCString encoding;
mMetaScanner->sniff(&readable, encoding);
if (!encoding.IsEmpty()) {
// meta scan successful; honor overrides unless meta is XSS-dangerous
if ((mCharsetSource == kCharsetFromParentForced ||
mCharsetSource == kCharsetFromUserForced) &&
EncodingUtils::IsAsciiCompatible(encoding)) {
// Honor override
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment,
aCount, aWriteCount);
}
mCharset.Assign(encoding);
mCharsetSource = kCharsetFromMetaPrescan;
mFeedChardet = false;
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
mMetaScanner = nullptr;
return WriteSniffingBufferAndCurrentSegment(aFromSegment,
aCount,
aWriteCount);
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment,
aCount, aWriteCount);
}
}
@@ -975,9 +987,11 @@ nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest, nsISupports* aContext)
mFeedChardet = false;
// Instantiate the converter here to avoid BOM sniffing.
nsCOMPtr<nsICharsetConverterManager> convManager = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
nsCOMPtr<nsICharsetConverterManager> convManager =
do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
NS_ENSURE_SUCCESS(rv, rv);
rv = convManager->GetUnicodeDecoder(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
rv = convManager->GetUnicodeDecoderRaw(mCharset.get(),
getter_AddRefs(mUnicodeDecoder));
// if we failed to get a decoder, there will be fallback, so don't propagate
// the error.
if (NS_FAILED(rv)) {