Bug 482921 part 6 - Use the internal encoding declaration in the XML View Source case. r=Olli.Pettay.

This commit is contained in:
Henri Sivonen
2011-11-01 13:33:11 +02:00
parent fe260c815b
commit b8278a6ef8
2 changed files with 211 additions and 34 deletions

View File

@@ -54,6 +54,8 @@
#include "nsIScriptError.h"
#include "mozilla/Preferences.h"
#include "nsHtml5Highlighter.h"
#include "expat_config.h"
#include "expat.h"
using namespace mozilla;
@@ -403,6 +405,76 @@ nsHtml5StreamParser::SniffBOMlessUTF16BasicLatin(const PRUint8* aFromSegment,
mFeedChardet = false;
}
void
nsHtml5StreamParser::MaybeSetEncodingFromExpat(const PRUnichar* aEncoding)
{
nsDependentString utf16(aEncoding);
nsCAutoString utf8;
CopyUTF16toUTF8(utf16, utf8);
if (PreferredForInternalEncodingDecl(utf8)) {
mCharset.Assign(utf8);
mCharsetSource = kCharsetFromMetaTag; // closest for XML
}
}
// A separate user data struct is used instead of passing the
// nsHtml5StreamParser instance as user data in order to avoid including
// expat.h in nsHtml5StreamParser.h. Doing that would cause naming conflicts.
// Using a separate user data struct also avoids bloating nsHtml5StreamParser
// by one pointer.
struct UserData {
XML_Parser mExpat;
nsHtml5StreamParser* mStreamParser;
};
// Using no-namespace handler callbacks to avoid including expat.h in
// nsHtml5StreamParser.h, since doing so would cause naming conclicts.
static void
HandleXMLDeclaration(void* aUserData,
const XML_Char* aVersion,
const XML_Char* aEncoding,
int aStandalone)
{
UserData* ud = static_cast<UserData*>(aUserData);
ud->mStreamParser->MaybeSetEncodingFromExpat(
reinterpret_cast<const PRUnichar*>(aEncoding));
XML_StopParser(ud->mExpat, false);
}
static void
HandleStartElement(void* aUserData,
const XML_Char* aName,
const XML_Char **aAtts)
{
UserData* ud = static_cast<UserData*>(aUserData);
XML_StopParser(ud->mExpat, false);
}
static void
HandleEndElement(void* aUserData,
const XML_Char* aName)
{
UserData* ud = static_cast<UserData*>(aUserData);
XML_StopParser(ud->mExpat, false);
}
static void
HandleComment(void* aUserData,
const XML_Char* aName)
{
UserData* ud = static_cast<UserData*>(aUserData);
XML_StopParser(ud->mExpat, false);
}
static void
HandleProcessingInstruction(void* aUserData,
const XML_Char* aTarget,
const XML_Char* aData)
{
UserData* ud = static_cast<UserData*>(aUserData);
XML_StopParser(ud->mExpat, false);
}
nsresult
nsHtml5StreamParser::FinalizeSniffing(const PRUint8* aFromSegment, // can be null
PRUint32 aCount,
@@ -410,6 +482,70 @@ nsHtml5StreamParser::FinalizeSniffing(const PRUint8* aFromSegment, // can be nul
PRUint32 aCountToSniffingLimit)
{
NS_ASSERTION(IsParserThread(), "Wrong thread!");
NS_ASSERTION(mCharsetSource < kCharsetFromMetaTag,
"Should not finalize sniffing when already confident.");
if (mMode == VIEW_SOURCE_XML) {
static const XML_Memory_Handling_Suite memsuite =
{
(void *(*)(size_t))moz_xmalloc,
(void *(*)(void *, size_t))moz_xrealloc,
moz_free
};
static const PRUnichar kExpatSeparator[] = { 0xFFFF, '\0' };
static const PRUnichar kISO88591[] =
{ 'I', 'S', 'O', '-', '8', '8', '5', '9', '-', '1', '\0' };
UserData ud;
ud.mStreamParser = this;
// If we got this far, the stream didn't have a BOM. UTF-16-encoded XML
// documents MUST begin with a BOM. We don't support EBCDIC and such.
// Thus, at this point, what we have is garbage or something encoded using
// a rough ASCII superset. ISO-8859-1 allows us to decode ASCII bytes
// without throwing errors when bytes have the most significant bit set
// and without triggering expat's unknown encoding code paths. This is
// enough to be able to use expat to parse the XML declaration in order
// to extract the encoding name from it.
ud.mExpat = XML_ParserCreate_MM(kISO88591, &memsuite, kExpatSeparator);
XML_SetXmlDeclHandler(ud.mExpat, HandleXMLDeclaration);
XML_SetElementHandler(ud.mExpat, HandleStartElement, HandleEndElement);
XML_SetCommentHandler(ud.mExpat, HandleComment);
XML_SetProcessingInstructionHandler(ud.mExpat, HandleProcessingInstruction);
XML_SetUserData(ud.mExpat, static_cast<void*>(&ud));
XML_Status status = XML_STATUS_OK;
if (mSniffingBuffer) {
status = XML_Parse(ud.mExpat,
reinterpret_cast<const char*>(mSniffingBuffer.get()),
mSniffingLength,
false);
}
if (status == XML_STATUS_OK &&
mCharsetSource < kCharsetFromMetaTag &&
aFromSegment) {
status = XML_Parse(ud.mExpat,
reinterpret_cast<const char*>(aFromSegment),
aCountToSniffingLimit,
false);
}
XML_ParserFree(ud.mExpat);
if (mCharsetSource < kCharsetFromMetaTag) {
// Failed to get an encoding from the XML declaration. XML defaults
// confidently to UTF-8 in this case.
// It is also possible that the document has an XML declaration that is
// longer than 1024 bytes, but that case is not worth worrying about.
mCharset.AssignLiteral("UTF-8");
mCharsetSource = kCharsetFromMetaTag; // means confident
}
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment,
aCount,
aWriteCount);
}
// meta scan failed.
if (mCharsetSource >= kCharsetFromHintPrevDoc) {
mFeedChardet = false;
@@ -537,39 +673,52 @@ nsHtml5StreamParser::SniffStreamBytes(const PRUint8* aFromSegment,
}
// if we get here, there either was no BOM or the BOM sniffing isn't complete yet
if (!mMetaScanner) {
if (!mMetaScanner && (mMode == NORMAL || mMode == VIEW_SOURCE_HTML)) {
mMetaScanner = new nsHtml5MetaScanner();
}
if (mSniffingLength + aCount >= NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE) {
// this is the last buffer
PRUint32 countToSniffingLimit = NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE - mSniffingLength;
nsHtml5ByteReadable readable(aFromSegment, aFromSegment + countToSniffingLimit);
PRUint32 countToSniffingLimit =
NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE - mSniffingLength;
if (mMode == NORMAL || mMode == VIEW_SOURCE_HTML) {
nsHtml5ByteReadable readable(aFromSegment, aFromSegment
+ countToSniffingLimit);
mMetaScanner->sniff(&readable, getter_AddRefs(mUnicodeDecoder), mCharset);
if (mUnicodeDecoder) {
mUnicodeDecoder->SetInputErrorBehavior(
nsIUnicodeDecoder::kOnError_Recover);
// meta scan successful
mCharsetSource = kCharsetFromMetaPrescan;
mFeedChardet = false;
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
mMetaScanner = nsnull;
return WriteSniffingBufferAndCurrentSegment(aFromSegment, aCount,
aWriteCount);
}
}
return FinalizeSniffing(aFromSegment, aCount, aWriteCount,
countToSniffingLimit);
}
// not the last buffer
if (mMode == NORMAL || mMode == VIEW_SOURCE_HTML) {
nsHtml5ByteReadable readable(aFromSegment, aFromSegment + aCount);
mMetaScanner->sniff(&readable, getter_AddRefs(mUnicodeDecoder), mCharset);
if (mUnicodeDecoder) {
mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Recover);
// meta scan successful
mUnicodeDecoder->SetInputErrorBehavior(
nsIUnicodeDecoder::kOnError_Recover);
mCharsetSource = kCharsetFromMetaPrescan;
mFeedChardet = false;
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
mMetaScanner = nsnull;
return WriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
return WriteSniffingBufferAndCurrentSegment(aFromSegment,
aCount,
aWriteCount);
}
return FinalizeSniffing(aFromSegment, aCount, aWriteCount, countToSniffingLimit);
}
// not the last buffer
nsHtml5ByteReadable readable(aFromSegment, aFromSegment + aCount);
mMetaScanner->sniff(&readable, getter_AddRefs(mUnicodeDecoder), mCharset);
if (mUnicodeDecoder) {
// meta scan successful
mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Recover);
mCharsetSource = kCharsetFromMetaPrescan;
mFeedChardet = false;
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
mMetaScanner = nsnull;
return WriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
}
if (!mSniffingBuffer) {
const mozilla::fallible_t fallible = mozilla::fallible_t();
mSniffingBuffer = new (fallible)
@@ -942,22 +1091,9 @@ nsHtml5StreamParser::OnDataAvailable(nsIRequest* aRequest,
}
bool
nsHtml5StreamParser::internalEncodingDeclaration(nsString* aEncoding)
nsHtml5StreamParser::PreferredForInternalEncodingDecl(nsACString& aEncoding)
{
// This code needs to stay in sync with
// nsHtml5MetaScanner::tryCharset. Unfortunately, the
// trickery with member fields there leads to some copy-paste reuse. :-(
NS_ASSERTION(IsParserThread(), "Wrong thread!");
if (mCharsetSource >= kCharsetFromMetaTag) { // this threshold corresponds to "confident" in the HTML5 spec
return false;
}
if (mReparseForbidden) {
return false; // not reparsing even if we wanted to
}
nsCAutoString newEncoding;
CopyUTF16toUTF8(*aEncoding, newEncoding);
nsCAutoString newEncoding(aEncoding);
newEncoding.Trim(" \t\r\n\f");
if (newEncoding.LowerCaseEqualsLiteral("utf-16") ||
newEncoding.LowerCaseEqualsLiteral("utf-16be") ||
@@ -1004,11 +1140,36 @@ nsHtml5StreamParser::internalEncodingDeclaration(nsString* aEncoding)
// Not a rough ASCII superset
return false;
}
aEncoding.Assign(preferred);
return true;
}
bool
nsHtml5StreamParser::internalEncodingDeclaration(nsString* aEncoding)
{
// This code needs to stay in sync with
// nsHtml5MetaScanner::tryCharset. Unfortunately, the
// trickery with member fields there leads to some copy-paste reuse. :-(
NS_ASSERTION(IsParserThread(), "Wrong thread!");
if (mCharsetSource >= kCharsetFromMetaTag) { // this threshold corresponds to "confident" in the HTML5 spec
return false;
}
if (mReparseForbidden) {
return false; // not reparsing even if we wanted to
}
nsCAutoString newEncoding;
CopyUTF16toUTF8(*aEncoding, newEncoding);
if (!PreferredForInternalEncodingDecl(newEncoding)) {
return false;
}
// Avoid having the chardet ask for another restart after this restart
// request.
mFeedChardet = false;
mTreeBuilder->NeedsCharsetSwitchTo(preferred, kCharsetFromMetaTag);
mTreeBuilder->NeedsCharsetSwitchTo(newEncoding, kCharsetFromMetaTag);
FlushTreeOpsAndDisarmTimer();
Interrupt();
// the tree op executor will cause the stream parser to terminate