Bug 482921 part 6 - Use the internal encoding declaration in the XML View Source case. r=Olli.Pettay.
This commit is contained in:
@@ -54,6 +54,8 @@
|
||||
#include "nsIScriptError.h"
|
||||
#include "mozilla/Preferences.h"
|
||||
#include "nsHtml5Highlighter.h"
|
||||
#include "expat_config.h"
|
||||
#include "expat.h"
|
||||
|
||||
using namespace mozilla;
|
||||
|
||||
@@ -403,6 +405,76 @@ nsHtml5StreamParser::SniffBOMlessUTF16BasicLatin(const PRUint8* aFromSegment,
|
||||
mFeedChardet = false;
|
||||
}
|
||||
|
||||
void
|
||||
nsHtml5StreamParser::MaybeSetEncodingFromExpat(const PRUnichar* aEncoding)
|
||||
{
|
||||
nsDependentString utf16(aEncoding);
|
||||
nsCAutoString utf8;
|
||||
CopyUTF16toUTF8(utf16, utf8);
|
||||
if (PreferredForInternalEncodingDecl(utf8)) {
|
||||
mCharset.Assign(utf8);
|
||||
mCharsetSource = kCharsetFromMetaTag; // closest for XML
|
||||
}
|
||||
}
|
||||
|
||||
// A separate user data struct is used instead of passing the
|
||||
// nsHtml5StreamParser instance as user data in order to avoid including
|
||||
// expat.h in nsHtml5StreamParser.h. Doing that would cause naming conflicts.
|
||||
// Using a separate user data struct also avoids bloating nsHtml5StreamParser
|
||||
// by one pointer.
|
||||
struct UserData {
|
||||
XML_Parser mExpat;
|
||||
nsHtml5StreamParser* mStreamParser;
|
||||
};
|
||||
|
||||
// Using no-namespace handler callbacks to avoid including expat.h in
|
||||
// nsHtml5StreamParser.h, since doing so would cause naming conclicts.
|
||||
static void
|
||||
HandleXMLDeclaration(void* aUserData,
|
||||
const XML_Char* aVersion,
|
||||
const XML_Char* aEncoding,
|
||||
int aStandalone)
|
||||
{
|
||||
UserData* ud = static_cast<UserData*>(aUserData);
|
||||
ud->mStreamParser->MaybeSetEncodingFromExpat(
|
||||
reinterpret_cast<const PRUnichar*>(aEncoding));
|
||||
XML_StopParser(ud->mExpat, false);
|
||||
}
|
||||
|
||||
static void
|
||||
HandleStartElement(void* aUserData,
|
||||
const XML_Char* aName,
|
||||
const XML_Char **aAtts)
|
||||
{
|
||||
UserData* ud = static_cast<UserData*>(aUserData);
|
||||
XML_StopParser(ud->mExpat, false);
|
||||
}
|
||||
|
||||
static void
|
||||
HandleEndElement(void* aUserData,
|
||||
const XML_Char* aName)
|
||||
{
|
||||
UserData* ud = static_cast<UserData*>(aUserData);
|
||||
XML_StopParser(ud->mExpat, false);
|
||||
}
|
||||
|
||||
static void
|
||||
HandleComment(void* aUserData,
|
||||
const XML_Char* aName)
|
||||
{
|
||||
UserData* ud = static_cast<UserData*>(aUserData);
|
||||
XML_StopParser(ud->mExpat, false);
|
||||
}
|
||||
|
||||
static void
|
||||
HandleProcessingInstruction(void* aUserData,
|
||||
const XML_Char* aTarget,
|
||||
const XML_Char* aData)
|
||||
{
|
||||
UserData* ud = static_cast<UserData*>(aUserData);
|
||||
XML_StopParser(ud->mExpat, false);
|
||||
}
|
||||
|
||||
nsresult
|
||||
nsHtml5StreamParser::FinalizeSniffing(const PRUint8* aFromSegment, // can be null
|
||||
PRUint32 aCount,
|
||||
@@ -410,6 +482,70 @@ nsHtml5StreamParser::FinalizeSniffing(const PRUint8* aFromSegment, // can be nul
|
||||
PRUint32 aCountToSniffingLimit)
|
||||
{
|
||||
NS_ASSERTION(IsParserThread(), "Wrong thread!");
|
||||
NS_ASSERTION(mCharsetSource < kCharsetFromMetaTag,
|
||||
"Should not finalize sniffing when already confident.");
|
||||
if (mMode == VIEW_SOURCE_XML) {
|
||||
static const XML_Memory_Handling_Suite memsuite =
|
||||
{
|
||||
(void *(*)(size_t))moz_xmalloc,
|
||||
(void *(*)(void *, size_t))moz_xrealloc,
|
||||
moz_free
|
||||
};
|
||||
|
||||
static const PRUnichar kExpatSeparator[] = { 0xFFFF, '\0' };
|
||||
|
||||
static const PRUnichar kISO88591[] =
|
||||
{ 'I', 'S', 'O', '-', '8', '8', '5', '9', '-', '1', '\0' };
|
||||
|
||||
UserData ud;
|
||||
ud.mStreamParser = this;
|
||||
|
||||
// If we got this far, the stream didn't have a BOM. UTF-16-encoded XML
|
||||
// documents MUST begin with a BOM. We don't support EBCDIC and such.
|
||||
// Thus, at this point, what we have is garbage or something encoded using
|
||||
// a rough ASCII superset. ISO-8859-1 allows us to decode ASCII bytes
|
||||
// without throwing errors when bytes have the most significant bit set
|
||||
// and without triggering expat's unknown encoding code paths. This is
|
||||
// enough to be able to use expat to parse the XML declaration in order
|
||||
// to extract the encoding name from it.
|
||||
ud.mExpat = XML_ParserCreate_MM(kISO88591, &memsuite, kExpatSeparator);
|
||||
XML_SetXmlDeclHandler(ud.mExpat, HandleXMLDeclaration);
|
||||
XML_SetElementHandler(ud.mExpat, HandleStartElement, HandleEndElement);
|
||||
XML_SetCommentHandler(ud.mExpat, HandleComment);
|
||||
XML_SetProcessingInstructionHandler(ud.mExpat, HandleProcessingInstruction);
|
||||
XML_SetUserData(ud.mExpat, static_cast<void*>(&ud));
|
||||
|
||||
XML_Status status = XML_STATUS_OK;
|
||||
if (mSniffingBuffer) {
|
||||
status = XML_Parse(ud.mExpat,
|
||||
reinterpret_cast<const char*>(mSniffingBuffer.get()),
|
||||
mSniffingLength,
|
||||
false);
|
||||
}
|
||||
if (status == XML_STATUS_OK &&
|
||||
mCharsetSource < kCharsetFromMetaTag &&
|
||||
aFromSegment) {
|
||||
status = XML_Parse(ud.mExpat,
|
||||
reinterpret_cast<const char*>(aFromSegment),
|
||||
aCountToSniffingLimit,
|
||||
false);
|
||||
}
|
||||
XML_ParserFree(ud.mExpat);
|
||||
|
||||
if (mCharsetSource < kCharsetFromMetaTag) {
|
||||
// Failed to get an encoding from the XML declaration. XML defaults
|
||||
// confidently to UTF-8 in this case.
|
||||
// It is also possible that the document has an XML declaration that is
|
||||
// longer than 1024 bytes, but that case is not worth worrying about.
|
||||
mCharset.AssignLiteral("UTF-8");
|
||||
mCharsetSource = kCharsetFromMetaTag; // means confident
|
||||
}
|
||||
|
||||
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment,
|
||||
aCount,
|
||||
aWriteCount);
|
||||
}
|
||||
|
||||
// meta scan failed.
|
||||
if (mCharsetSource >= kCharsetFromHintPrevDoc) {
|
||||
mFeedChardet = false;
|
||||
@@ -537,39 +673,52 @@ nsHtml5StreamParser::SniffStreamBytes(const PRUint8* aFromSegment,
|
||||
}
|
||||
// if we get here, there either was no BOM or the BOM sniffing isn't complete yet
|
||||
|
||||
if (!mMetaScanner) {
|
||||
if (!mMetaScanner && (mMode == NORMAL || mMode == VIEW_SOURCE_HTML)) {
|
||||
mMetaScanner = new nsHtml5MetaScanner();
|
||||
}
|
||||
|
||||
if (mSniffingLength + aCount >= NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE) {
|
||||
// this is the last buffer
|
||||
PRUint32 countToSniffingLimit = NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE - mSniffingLength;
|
||||
nsHtml5ByteReadable readable(aFromSegment, aFromSegment + countToSniffingLimit);
|
||||
PRUint32 countToSniffingLimit =
|
||||
NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE - mSniffingLength;
|
||||
if (mMode == NORMAL || mMode == VIEW_SOURCE_HTML) {
|
||||
nsHtml5ByteReadable readable(aFromSegment, aFromSegment
|
||||
+ countToSniffingLimit);
|
||||
mMetaScanner->sniff(&readable, getter_AddRefs(mUnicodeDecoder), mCharset);
|
||||
if (mUnicodeDecoder) {
|
||||
mUnicodeDecoder->SetInputErrorBehavior(
|
||||
nsIUnicodeDecoder::kOnError_Recover);
|
||||
// meta scan successful
|
||||
mCharsetSource = kCharsetFromMetaPrescan;
|
||||
mFeedChardet = false;
|
||||
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
|
||||
mMetaScanner = nsnull;
|
||||
return WriteSniffingBufferAndCurrentSegment(aFromSegment, aCount,
|
||||
aWriteCount);
|
||||
}
|
||||
}
|
||||
return FinalizeSniffing(aFromSegment, aCount, aWriteCount,
|
||||
countToSniffingLimit);
|
||||
}
|
||||
|
||||
// not the last buffer
|
||||
if (mMode == NORMAL || mMode == VIEW_SOURCE_HTML) {
|
||||
nsHtml5ByteReadable readable(aFromSegment, aFromSegment + aCount);
|
||||
mMetaScanner->sniff(&readable, getter_AddRefs(mUnicodeDecoder), mCharset);
|
||||
if (mUnicodeDecoder) {
|
||||
mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Recover);
|
||||
// meta scan successful
|
||||
mUnicodeDecoder->SetInputErrorBehavior(
|
||||
nsIUnicodeDecoder::kOnError_Recover);
|
||||
mCharsetSource = kCharsetFromMetaPrescan;
|
||||
mFeedChardet = false;
|
||||
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
|
||||
mMetaScanner = nsnull;
|
||||
return WriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
|
||||
return WriteSniffingBufferAndCurrentSegment(aFromSegment,
|
||||
aCount,
|
||||
aWriteCount);
|
||||
}
|
||||
return FinalizeSniffing(aFromSegment, aCount, aWriteCount, countToSniffingLimit);
|
||||
}
|
||||
|
||||
// not the last buffer
|
||||
nsHtml5ByteReadable readable(aFromSegment, aFromSegment + aCount);
|
||||
mMetaScanner->sniff(&readable, getter_AddRefs(mUnicodeDecoder), mCharset);
|
||||
if (mUnicodeDecoder) {
|
||||
// meta scan successful
|
||||
mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Recover);
|
||||
mCharsetSource = kCharsetFromMetaPrescan;
|
||||
mFeedChardet = false;
|
||||
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
|
||||
mMetaScanner = nsnull;
|
||||
return WriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
|
||||
}
|
||||
if (!mSniffingBuffer) {
|
||||
const mozilla::fallible_t fallible = mozilla::fallible_t();
|
||||
mSniffingBuffer = new (fallible)
|
||||
@@ -942,22 +1091,9 @@ nsHtml5StreamParser::OnDataAvailable(nsIRequest* aRequest,
|
||||
}
|
||||
|
||||
bool
|
||||
nsHtml5StreamParser::internalEncodingDeclaration(nsString* aEncoding)
|
||||
nsHtml5StreamParser::PreferredForInternalEncodingDecl(nsACString& aEncoding)
|
||||
{
|
||||
// This code needs to stay in sync with
|
||||
// nsHtml5MetaScanner::tryCharset. Unfortunately, the
|
||||
// trickery with member fields there leads to some copy-paste reuse. :-(
|
||||
NS_ASSERTION(IsParserThread(), "Wrong thread!");
|
||||
if (mCharsetSource >= kCharsetFromMetaTag) { // this threshold corresponds to "confident" in the HTML5 spec
|
||||
return false;
|
||||
}
|
||||
|
||||
if (mReparseForbidden) {
|
||||
return false; // not reparsing even if we wanted to
|
||||
}
|
||||
|
||||
nsCAutoString newEncoding;
|
||||
CopyUTF16toUTF8(*aEncoding, newEncoding);
|
||||
nsCAutoString newEncoding(aEncoding);
|
||||
newEncoding.Trim(" \t\r\n\f");
|
||||
if (newEncoding.LowerCaseEqualsLiteral("utf-16") ||
|
||||
newEncoding.LowerCaseEqualsLiteral("utf-16be") ||
|
||||
@@ -1004,11 +1140,36 @@ nsHtml5StreamParser::internalEncodingDeclaration(nsString* aEncoding)
|
||||
// Not a rough ASCII superset
|
||||
return false;
|
||||
}
|
||||
aEncoding.Assign(preferred);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
nsHtml5StreamParser::internalEncodingDeclaration(nsString* aEncoding)
|
||||
{
|
||||
// This code needs to stay in sync with
|
||||
// nsHtml5MetaScanner::tryCharset. Unfortunately, the
|
||||
// trickery with member fields there leads to some copy-paste reuse. :-(
|
||||
NS_ASSERTION(IsParserThread(), "Wrong thread!");
|
||||
if (mCharsetSource >= kCharsetFromMetaTag) { // this threshold corresponds to "confident" in the HTML5 spec
|
||||
return false;
|
||||
}
|
||||
|
||||
if (mReparseForbidden) {
|
||||
return false; // not reparsing even if we wanted to
|
||||
}
|
||||
|
||||
nsCAutoString newEncoding;
|
||||
CopyUTF16toUTF8(*aEncoding, newEncoding);
|
||||
|
||||
if (!PreferredForInternalEncodingDecl(newEncoding)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Avoid having the chardet ask for another restart after this restart
|
||||
// request.
|
||||
mFeedChardet = false;
|
||||
mTreeBuilder->NeedsCharsetSwitchTo(preferred, kCharsetFromMetaTag);
|
||||
mTreeBuilder->NeedsCharsetSwitchTo(newEncoding, kCharsetFromMetaTag);
|
||||
FlushTreeOpsAndDisarmTimer();
|
||||
Interrupt();
|
||||
// the tree op executor will cause the stream parser to terminate
|
||||
|
||||
Reference in New Issue
Block a user