Handle unclosed and malformed TITLE tag. b=42945, r=heikki, sr=jst

2003-01-03 19:21:20 +00:00
parent 4c0f06dcb5
commit 71e9f50a14
6 changed files with 286 additions and 194 deletions
--- a/htmlparser/src/CNavDTD.cpp
+++ b/htmlparser/src/CNavDTD.cpp
@@ -555,56 +555,106 @@ nsresult CNavDTD::BuildModel(nsIParser* aParser,nsITokenizer* aTokenizer,nsIToke
  return result;
 }

+/** Push a synthesized aTarget token of kind aType and run BuildModel on it.
+ * @param aTarget - Tag that was neglected in the document.
+ * @param aType   - Token type to create for the target. Ex. start, end, text, etc.
+ * @param aParser - Parser handed through to BuildModel.
+ * @param aSink   - HTML content sink handed through to BuildModel.
+ * @return NS_OK if the tokenizer or allocator is missing, else BuildModel's result. */
+nsresult
+CNavDTD::BuildNeglectedTarget(eHTMLTags aTarget,
+                              eHTMLTokenTypes aType,
+                              nsIParser* aParser,
+                              nsIContentSink* aSink)
+{ 
+  NS_ASSERTION(mTokenizer, "tokenizer is null! unable to build target.");
+  NS_ASSERTION(mTokenAllocator, "unable to create tokens without an allocator.");
+  if (!mTokenizer || !mTokenAllocator)
+    return NS_OK;
+  if (eHTMLTag_unknown != mSkipTarget && eHTMLTag_title == aTarget) {
+    PRInt32 size = mSkippedContent.GetSize();
+    // Note: The first slot of the skipped-content deque holds the
+    // opened skip target itself, so the scan below starts at index 1.
+    // We can only guess at the title's contents, because the
+    // document never supplied an end title tag and so there is no
+    // reliable way to know which markup was meant to be in the title.
+    // The assumption used here is that any token other than text,
+    // entity, whitespace, newline, or attribute is unlikely to belong
+    // in the title; the title is cut off at the first such token.
+    PRInt32 index;
+    for (index = 1; index < size; index++) {
+      CHTMLToken* token = 
+        NS_REINTERPRET_CAST(CHTMLToken*, mSkippedContent.ObjectAt(index));
+      NS_ASSERTION(token, "there is a null token in the skipped content list!");
+      eHTMLTokenTypes type = eHTMLTokenTypes(token->GetTokenType());
+      if (eToken_whitespace != type && 
+          eToken_newline != type    && 
+          eToken_text != type       && 
+          eToken_entity != type     &&
+          eToken_attribute != type) {
+        // Pop the remaining (size - index) tokens, which we guess do not
+        // belong in the title, and push each onto the front of the token queue.
+        while (size != index++) {
+          token = NS_REINTERPRET_CAST(CHTMLToken*, mSkippedContent.Pop()); 
+          mTokenizer->PushTokenFront(token);
+        }
+        break;
+      }
+    }
+  }
+  CHTMLToken* target = 
+      NS_STATIC_CAST(CHTMLToken*, mTokenAllocator->CreateTokenOfType(aType, aTarget));
+  mTokenizer->PushTokenFront(target); // NOTE(review): target is not null-checked - confirm CreateTokenOfType cannot fail
+  return BuildModel(aParser, mTokenizer, 0, aSink);
+}
+
 /**
 * 
 * @update  gess5/18/98
 * @param 
 * @return 
 */
-nsresult CNavDTD::DidBuildModel(nsresult anErrorCode,PRBool aNotifySink,nsIParser* aParser,nsIContentSink* aSink){
-  nsresult result=NS_OK;
+nsresult CNavDTD::DidBuildModel(nsresult anErrorCode,
+                                PRBool aNotifySink,
+                                nsIParser* aParser,
+                                nsIContentSink* aSink)
+{
+  if (!aSink)
+    return NS_OK;
+  nsresult result = NS_OK;
+  if (aParser && aNotifySink) { 
+    if (NS_OK == anErrorCode) {
+      if (eHTMLTag_unknown != mSkipTarget) {
+        // Looks like there is an open target ( ex. <title>, <textarea> ).
+        // Create a matching target to handle the unclosed target.
+        result = BuildNeglectedTarget(mSkipTarget, eToken_end, aParser, aSink);
+        NS_ENSURE_SUCCESS(result , result);
+      }
+      if (!(mFlags & (NS_DTD_FLAG_HAD_FRAMESET | NS_DTD_FLAG_HAD_BODY))) {
+        // This is not a frameset document, but it did not contain a body
+        // tag either, so make one. Note: the body tag is optional per the spec.
+        result = BuildNeglectedTarget(eHTMLTag_body, eToken_start, aParser, aSink);
+        NS_ENSURE_SUCCESS(result , result);
+      }
+      if (mFlags & NS_DTD_FLAG_MISPLACED_CONTENT) {
+        // Looks like the misplaced contents are not processed yet.
+        // Here is our last chance to handle the misplaced content.
+        mFlags &= ~NS_DTD_FLAG_MISPLACED_CONTENT; 
        
-  if(aSink) { 
+        // mContextTopIndex refers to the misplaced content's legal parent index.
+        result = HandleSavedTokens(mBodyContext->mContextTopIndex);
+        NS_ENSURE_SUCCESS(result, result);

-    if((NS_OK==anErrorCode) && !(mFlags & (NS_DTD_FLAG_HAD_BODY | NS_DTD_FLAG_HAD_FRAMESET))) {
-
-      mSkipTarget=eHTMLTag_unknown; //clear this in case we were searching earlier.
-
-      if (mTokenAllocator) {
-        CStartToken *theToken = NS_STATIC_CAST(CStartToken*, mTokenAllocator->CreateTokenOfType(eToken_start,eHTMLTag_body,NS_LITERAL_STRING("body")));
-        mTokenizer->PushTokenFront(theToken); //this token should get pushed on the context stack, don't recycle it 
-        result = BuildModel(aParser,mTokenizer, 0, aSink);
+        mBodyContext->mContextTopIndex = -1; 
      }          
-    } 
-
-    if(aParser && (NS_OK==result)){ 
-      if(aNotifySink){ 
-        if((NS_OK==anErrorCode) && (mBodyContext->GetCount()>0)) {
-          if (mTokenAllocator) {
-            if(mSkipTarget) {
-              CHTMLToken* theEndToken=nsnull;
-              theEndToken=NS_STATIC_CAST(CHTMLToken*,mTokenAllocator->CreateTokenOfType(eToken_end,mSkipTarget));
-              if(theEndToken) {
-                result=HandleToken(theEndToken,mParser);
-              }
-            }
-            if(mFlags & NS_DTD_FLAG_MISPLACED_CONTENT) {
-              // Create an end table token to flush tokens off the misplaced list...
-              CHTMLToken* theTableToken=NS_STATIC_CAST(CHTMLToken*,mTokenAllocator->CreateTokenOfType(eToken_end,eHTMLTag_table));
-              if(theTableToken) {
-                result=HandleToken(theTableToken,mParser);
-              }
-            }
-          }
-          if(NS_SUCCEEDED(result)) {
-            eHTMLTags theTarget; 
-
      //now let's disable style handling to save time when closing remaining stack members...
      mFlags &= ~NS_DTD_FLAG_ENABLE_RESIDUAL_STYLE;
-
-            while(mBodyContext->GetCount() > 0) { 
-              theTarget = mBodyContext->Last(); 
-              CloseContainersTo(theTarget,PR_FALSE); 
+      while (mBodyContext->GetCount() > 0) { 
+        result = CloseContainersTo(mBodyContext->Last(), PR_FALSE);
+        if (NS_FAILED(result)) {
+          //No matter what, you need to call did build model.
+          aSink->DidBuildModel(0);
+          return result;
        }
      } 
    } 
@@ -612,7 +662,7 @@ nsresult CNavDTD::DidBuildModel(nsresult anErrorCode,PRBool aNotifySink,nsIParse
      //If you're here, then an error occured, but we still have nodes on the stack.
      //At a minimum, we should grab the nodes and recycle them.
      //Just to be correct, we'll also recycle the nodes.
-          while(mBodyContext->GetCount() > 0) { 
+      while (mBodyContext->GetCount() > 0) { 
        nsEntryStack* theChildStyles = 0;
        nsCParserNode* theNode = mBodyContext->Pop(theChildStyles);
        IF_DELETE(theChildStyles,&mNodeAllocator);
@@ -626,30 +676,29 @@ nsresult CNavDTD::DidBuildModel(nsresult anErrorCode,PRBool aNotifySink,nsIParse
 #ifdef  ENABLE_CRC

      //let's only grab this state once! 
-        if(!gShowCRC) { 
+    if (!gShowCRC) { 
      gShowCRC=1; //this only indicates we'll not initialize again. 
      char* theEnvString = PR_GetEnv("RICKG_CRC"); 
-          if(theEnvString){ 
-            if(('1'==theEnvString[0]) || ('Y'==theEnvString[0]) || ('y'==theEnvString[0])){ 
+      if (theEnvString){ 
+        if (('1'== theEnvString[0]) || ('Y'== theEnvString[0]) || ('y'== theEnvString[0])){ 
          gShowCRC=2;  //this indicates that the CRC flag was found in the environment. 
        } 
      } 
    } 

-        if(2==gShowCRC) { 
-          if(mComputedCRC32!=mExpectedCRC32) { 
-            if(mExpectedCRC32!=0) { 
+    if (2 == gShowCRC) { 
+      if (mComputedCRC32 != mExpectedCRC32) { 
+        if (mExpectedCRC32 != 0) { 
          printf("CRC Computed: %u  Expected CRC: %u\n,",mComputedCRC32,mExpectedCRC32); 
          result = aSink->DidBuildModel(2); 
        } 
        else { 
          printf("Computed CRC: %u.\n",mComputedCRC32); 
          result = aSink->DidBuildModel(3); 
+          NS_ENSURE_SUCCESS(result, result);
        } 
      } 
-          else result = aSink->DidBuildModel(0); 
    }
-        else result=aSink->DidBuildModel(0); 
 #endif

    MOZ_TIMER_DEBUGLOG(("Start: Parse Time: CNavDTD::DidBuildModel(), this=%p\n", this));
@@ -658,18 +707,14 @@ nsresult CNavDTD::DidBuildModel(nsresult anErrorCode,PRBool aNotifySink,nsIParse
      //Now make sure the misplaced content list is empty,
      //by forcefully recycling any tokens we might find there.

-        CToken* theToken=0;
-        while((theToken=(CToken*)mMisplacedContent.Pop())) {
+    CToken* theToken = 0;
+    while ((theToken = (CToken*)mMisplacedContent.Pop())) {
      IF_FREE(theToken, mTokenAllocator);
    }
-      } 
  } //if aparser

  //No matter what, you need to call did build model.
-    result=aSink->DidBuildModel(0); 
-
-  } //if asink
-  return result;
+  return aSink->DidBuildModel(0); 
 }

 NS_IMETHODIMP_(void) 
--- a/htmlparser/src/CNavDTD.h
+++ b/htmlparser/src/CNavDTD.h
@@ -259,7 +259,8 @@ public:
    nsresult    HandleScriptToken(const nsIParserNode *aNode);
    nsresult    HandleProcessingInstructionToken(CToken* aToken);
    nsresult    HandleDocTypeDeclToken(CToken* aToken);
-
+    nsresult    BuildNeglectedTarget(eHTMLTags aTarget, eHTMLTokenTypes aType,
+                                     nsIParser* aParser, nsIContentSink* aSink);

    //*************************************************
    //these cover methods mimic the sink, and are used
--- a/htmlparser/src/nsHTMLTokens.cpp
+++ b/htmlparser/src/nsHTMLTokens.cpp
@@ -1734,8 +1734,8 @@ nsresult CAttributeToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 a
    { PRUnichar(' '), PRUnichar('"'), 
      PRUnichar('='), PRUnichar('\n'), 
      PRUnichar('\r'), PRUnichar('\t'), 
-      PRUnichar('>'), PRUnichar('\b'),
-      PRUnichar(0) };
+      PRUnichar('>'), PRUnichar('<'),
+      PRUnichar('\b'), PRUnichar(0) };
    static const nsReadEndCondition theEndCondition(theTerminalsChars);

    nsReadingIterator<PRUnichar> start, end;
--- a/parser/htmlparser/src/CNavDTD.cpp
+++ b/parser/htmlparser/src/CNavDTD.cpp
@@ -555,56 +555,106 @@ nsresult CNavDTD::BuildModel(nsIParser* aParser,nsITokenizer* aTokenizer,nsIToke
  return result;
 }

+/** Push a synthesized aTarget token of kind aType and run BuildModel on it.
+ * @param aTarget - Tag that was neglected in the document.
+ * @param aType   - Token type to create for the target. Ex. start, end, text, etc.
+ * @param aParser - Parser handed through to BuildModel.
+ * @param aSink   - HTML content sink handed through to BuildModel.
+ * @return NS_OK if the tokenizer or allocator is missing, else BuildModel's result. */
+nsresult
+CNavDTD::BuildNeglectedTarget(eHTMLTags aTarget,
+                              eHTMLTokenTypes aType,
+                              nsIParser* aParser,
+                              nsIContentSink* aSink)
+{ 
+  NS_ASSERTION(mTokenizer, "tokenizer is null! unable to build target.");
+  NS_ASSERTION(mTokenAllocator, "unable to create tokens without an allocator.");
+  if (!mTokenizer || !mTokenAllocator)
+    return NS_OK;
+  if (eHTMLTag_unknown != mSkipTarget && eHTMLTag_title == aTarget) {
+    PRInt32 size = mSkippedContent.GetSize();
+    // Note: The first slot of the skipped-content deque holds the
+    // opened skip target itself, so the scan below starts at index 1.
+    // We can only guess at the title's contents, because the
+    // document never supplied an end title tag and so there is no
+    // reliable way to know which markup was meant to be in the title.
+    // The assumption used here is that any token other than text,
+    // entity, whitespace, newline, or attribute is unlikely to belong
+    // in the title; the title is cut off at the first such token.
+    PRInt32 index;
+    for (index = 1; index < size; index++) {
+      CHTMLToken* token = 
+        NS_REINTERPRET_CAST(CHTMLToken*, mSkippedContent.ObjectAt(index));
+      NS_ASSERTION(token, "there is a null token in the skipped content list!");
+      eHTMLTokenTypes type = eHTMLTokenTypes(token->GetTokenType());
+      if (eToken_whitespace != type && 
+          eToken_newline != type    && 
+          eToken_text != type       && 
+          eToken_entity != type     &&
+          eToken_attribute != type) {
+        // Pop the remaining (size - index) tokens, which we guess do not
+        // belong in the title, and push each onto the front of the token queue.
+        while (size != index++) {
+          token = NS_REINTERPRET_CAST(CHTMLToken*, mSkippedContent.Pop()); 
+          mTokenizer->PushTokenFront(token);
+        }
+        break;
+      }
+    }
+  }
+  CHTMLToken* target = 
+      NS_STATIC_CAST(CHTMLToken*, mTokenAllocator->CreateTokenOfType(aType, aTarget));
+  mTokenizer->PushTokenFront(target); // NOTE(review): target is not null-checked - confirm CreateTokenOfType cannot fail
+  return BuildModel(aParser, mTokenizer, 0, aSink);
+}
+
 /**
 * 
 * @update  gess5/18/98
 * @param 
 * @return 
 */
-nsresult CNavDTD::DidBuildModel(nsresult anErrorCode,PRBool aNotifySink,nsIParser* aParser,nsIContentSink* aSink){
-  nsresult result=NS_OK;
+nsresult CNavDTD::DidBuildModel(nsresult anErrorCode,
+                                PRBool aNotifySink,
+                                nsIParser* aParser,
+                                nsIContentSink* aSink)
+{
+  if (!aSink)
+    return NS_OK;
+  nsresult result = NS_OK;
+  if (aParser && aNotifySink) { 
+    if (NS_OK == anErrorCode) {
+      if (eHTMLTag_unknown != mSkipTarget) {
+        // Looks like there is an open target ( ex. <title>, <textarea> ).
+        // Create a matching target to handle the unclosed target.
+        result = BuildNeglectedTarget(mSkipTarget, eToken_end, aParser, aSink);
+        NS_ENSURE_SUCCESS(result , result);
+      }
+      if (!(mFlags & (NS_DTD_FLAG_HAD_FRAMESET | NS_DTD_FLAG_HAD_BODY))) {
+        // This is not a frameset document, but it did not contain a body
+        // tag either, so make one. Note: the body tag is optional per the spec.
+        result = BuildNeglectedTarget(eHTMLTag_body, eToken_start, aParser, aSink);
+        NS_ENSURE_SUCCESS(result , result);
+      }
+      if (mFlags & NS_DTD_FLAG_MISPLACED_CONTENT) {
+        // Looks like the misplaced contents are not processed yet.
+        // Here is our last chance to handle the misplaced content.
+        mFlags &= ~NS_DTD_FLAG_MISPLACED_CONTENT; 
        
-  if(aSink) { 
+        // mContextTopIndex refers to the misplaced content's legal parent index.
+        result = HandleSavedTokens(mBodyContext->mContextTopIndex);
+        NS_ENSURE_SUCCESS(result, result);

-    if((NS_OK==anErrorCode) && !(mFlags & (NS_DTD_FLAG_HAD_BODY | NS_DTD_FLAG_HAD_FRAMESET))) {
-
-      mSkipTarget=eHTMLTag_unknown; //clear this in case we were searching earlier.
-
-      if (mTokenAllocator) {
-        CStartToken *theToken = NS_STATIC_CAST(CStartToken*, mTokenAllocator->CreateTokenOfType(eToken_start,eHTMLTag_body,NS_LITERAL_STRING("body")));
-        mTokenizer->PushTokenFront(theToken); //this token should get pushed on the context stack, don't recycle it 
-        result = BuildModel(aParser,mTokenizer, 0, aSink);
+        mBodyContext->mContextTopIndex = -1; 
      }          
-    } 
-
-    if(aParser && (NS_OK==result)){ 
-      if(aNotifySink){ 
-        if((NS_OK==anErrorCode) && (mBodyContext->GetCount()>0)) {
-          if (mTokenAllocator) {
-            if(mSkipTarget) {
-              CHTMLToken* theEndToken=nsnull;
-              theEndToken=NS_STATIC_CAST(CHTMLToken*,mTokenAllocator->CreateTokenOfType(eToken_end,mSkipTarget));
-              if(theEndToken) {
-                result=HandleToken(theEndToken,mParser);
-              }
-            }
-            if(mFlags & NS_DTD_FLAG_MISPLACED_CONTENT) {
-              // Create an end table token to flush tokens off the misplaced list...
-              CHTMLToken* theTableToken=NS_STATIC_CAST(CHTMLToken*,mTokenAllocator->CreateTokenOfType(eToken_end,eHTMLTag_table));
-              if(theTableToken) {
-                result=HandleToken(theTableToken,mParser);
-              }
-            }
-          }
-          if(NS_SUCCEEDED(result)) {
-            eHTMLTags theTarget; 
-
      //now let's disable style handling to save time when closing remaining stack members...
      mFlags &= ~NS_DTD_FLAG_ENABLE_RESIDUAL_STYLE;
-
-            while(mBodyContext->GetCount() > 0) { 
-              theTarget = mBodyContext->Last(); 
-              CloseContainersTo(theTarget,PR_FALSE); 
+      while (mBodyContext->GetCount() > 0) { 
+        result = CloseContainersTo(mBodyContext->Last(), PR_FALSE);
+        if (NS_FAILED(result)) {
+          //No matter what, you need to call did build model.
+          aSink->DidBuildModel(0);
+          return result;
        }
      } 
    } 
@@ -612,7 +662,7 @@ nsresult CNavDTD::DidBuildModel(nsresult anErrorCode,PRBool aNotifySink,nsIParse
      //If you're here, then an error occured, but we still have nodes on the stack.
      //At a minimum, we should grab the nodes and recycle them.
      //Just to be correct, we'll also recycle the nodes.
-          while(mBodyContext->GetCount() > 0) { 
+      while (mBodyContext->GetCount() > 0) { 
        nsEntryStack* theChildStyles = 0;
        nsCParserNode* theNode = mBodyContext->Pop(theChildStyles);
        IF_DELETE(theChildStyles,&mNodeAllocator);
@@ -626,30 +676,29 @@ nsresult CNavDTD::DidBuildModel(nsresult anErrorCode,PRBool aNotifySink,nsIParse
 #ifdef  ENABLE_CRC

      //let's only grab this state once! 
-        if(!gShowCRC) { 
+    if (!gShowCRC) { 
      gShowCRC=1; //this only indicates we'll not initialize again. 
      char* theEnvString = PR_GetEnv("RICKG_CRC"); 
-          if(theEnvString){ 
-            if(('1'==theEnvString[0]) || ('Y'==theEnvString[0]) || ('y'==theEnvString[0])){ 
+      if (theEnvString){ 
+        if (('1'== theEnvString[0]) || ('Y'== theEnvString[0]) || ('y'== theEnvString[0])){ 
          gShowCRC=2;  //this indicates that the CRC flag was found in the environment. 
        } 
      } 
    } 

-        if(2==gShowCRC) { 
-          if(mComputedCRC32!=mExpectedCRC32) { 
-            if(mExpectedCRC32!=0) { 
+    if (2 == gShowCRC) { 
+      if (mComputedCRC32 != mExpectedCRC32) { 
+        if (mExpectedCRC32 != 0) { 
          printf("CRC Computed: %u  Expected CRC: %u\n,",mComputedCRC32,mExpectedCRC32); 
          result = aSink->DidBuildModel(2); 
        } 
        else { 
          printf("Computed CRC: %u.\n",mComputedCRC32); 
          result = aSink->DidBuildModel(3); 
+          NS_ENSURE_SUCCESS(result, result);
        } 
      } 
-          else result = aSink->DidBuildModel(0); 
    }
-        else result=aSink->DidBuildModel(0); 
 #endif

    MOZ_TIMER_DEBUGLOG(("Start: Parse Time: CNavDTD::DidBuildModel(), this=%p\n", this));
@@ -658,18 +707,14 @@ nsresult CNavDTD::DidBuildModel(nsresult anErrorCode,PRBool aNotifySink,nsIParse
      //Now make sure the misplaced content list is empty,
      //by forcefully recycling any tokens we might find there.

-        CToken* theToken=0;
-        while((theToken=(CToken*)mMisplacedContent.Pop())) {
+    CToken* theToken = 0;
+    while ((theToken = (CToken*)mMisplacedContent.Pop())) {
      IF_FREE(theToken, mTokenAllocator);
    }
-      } 
  } //if aparser

  //No matter what, you need to call did build model.
-    result=aSink->DidBuildModel(0); 
-
-  } //if asink
-  return result;
+  return aSink->DidBuildModel(0); 
 }

 NS_IMETHODIMP_(void) 
--- a/parser/htmlparser/src/CNavDTD.h
+++ b/parser/htmlparser/src/CNavDTD.h
@@ -259,7 +259,8 @@ public:
    nsresult    HandleScriptToken(const nsIParserNode *aNode);
    nsresult    HandleProcessingInstructionToken(CToken* aToken);
    nsresult    HandleDocTypeDeclToken(CToken* aToken);
-
+    nsresult    BuildNeglectedTarget(eHTMLTags aTarget, eHTMLTokenTypes aType,
+                                     nsIParser* aParser, nsIContentSink* aSink);

    //*************************************************
    //these cover methods mimic the sink, and are used
--- a/parser/htmlparser/src/nsHTMLTokens.cpp
+++ b/parser/htmlparser/src/nsHTMLTokens.cpp
@@ -1734,8 +1734,8 @@ nsresult CAttributeToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 a
    { PRUnichar(' '), PRUnichar('"'), 
      PRUnichar('='), PRUnichar('\n'), 
      PRUnichar('\r'), PRUnichar('\t'), 
-      PRUnichar('>'), PRUnichar('\b'),
-      PRUnichar(0) };
+      PRUnichar('>'), PRUnichar('<'),
+      PRUnichar('\b'), PRUnichar(0) };
    static const nsReadEndCondition theEndCondition(theTerminalsChars);

    nsReadingIterator<PRUnichar> start, end;