tubestation/parser/htmlparser/src/nsHTMLTokens.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is mozilla.org code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either of the GNU General Public License Version 2 or later (the "GPL"),
 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

#include <ctype.h>
#include <time.h>
#include <stdio.h>
#include "nsScanner.h"
#include "nsToken.h"
#include "nsIAtom.h"
#include "nsHTMLTokens.h"
#include "prtypes.h"
#include "nsDebug.h"
#include "nsHTMLTags.h"
#include "nsHTMLEntities.h"
#include "nsCRT.h"
#include "nsReadableUtils.h"
#include "nsUnicharUtils.h"
#include "nsScanner.h"


// Zero-terminated PRUnichar spelling of the word "userdefined".
// NOTE(review): not referenced in the part of the file visible here;
// confirm it is still used before relying on it.
static const PRUnichar sUserdefined[] = {'u', 's', 'e', 'r', 'd', 'e', 'f',
                                         'i', 'n', 'e', 'd', 0};

// Zero-terminated list of characters: '&', backspace, tab, newline,
// carriage return, space and '>'.
// NOTE(review): judging by the name this is the stop set used when reading
// attribute text; the consumer is outside the visible part of the file.
static const PRUnichar kAttributeTerminalChars[] = {
  PRUnichar('&'), PRUnichar('\b'), PRUnichar('\t'),
  PRUnichar('\n'), PRUnichar('\r'), PRUnichar(' '),
  PRUnichar('>'),
  PRUnichar(0)
};


/**************************************************************
  And now for the token classes...
 **************************************************************/

/*
 *  constructor from tag id
 *
 *  @update  gess 3/25/98
 *  @param
 *  @return
 */
CHTMLToken::CHTMLToken(eHTMLTags aTag)
  : CToken(aTag)
{
  // Nothing to do beyond handing the tag id to the CToken base.
}


CHTMLToken::~CHTMLToken()
{
  // Intentionally empty.
}

/*
 *  constructor from tag id
 *
 *  @update  gess 3/25/98
 *  @param
 *  @return
 */
CStartToken::CStartToken(eHTMLTags aTag)
  : CHTMLToken(aTag)
{
  // A fresh start token has an unknown container form and is not
  // flagged as empty.
  mContainerInfo = eFormUnknown;
  mEmpty = PR_FALSE;
#ifdef DEBUG
  mAttributed = PR_FALSE;
#endif
}

CStartToken::CStartToken(const nsAString& aName)
  : CHTMLToken(eHTMLTag_unknown)
{
  // Only the tag name is known here; the tag id starts out as
  // eHTMLTag_unknown.
  mTextValue.Assign(aName);
  mContainerInfo = eFormUnknown;
  mEmpty = PR_FALSE;
#ifdef DEBUG
  mAttributed = PR_FALSE;
#endif
}

CStartToken::CStartToken(const nsAString& aName, eHTMLTags aTag)
  : CHTMLToken(aTag)
{
  // Both the tag name and the tag id are supplied by the caller.
  mTextValue.Assign(aName);
  mContainerInfo = eFormUnknown;
  mEmpty = PR_FALSE;
#ifdef DEBUG
  mAttributed = PR_FALSE;
#endif
}

nsresult CStartToken::GetIDAttributeAtom(nsIAtom** aResult)
{
  // Rejects a null out-pointer, then hands back the stored ID atom
  // (which may be null) with a reference added when it is non-null.
  NS_ENSURE_ARG_POINTER(aResult);

  *aResult = mIDAttributeAtom;
  NS_IF_ADDREF(*aResult);
  return NS_OK;
}


nsresult CStartToken::SetIDAttributeAtom(nsIAtom* aID)
{
  // A null atom is rejected by the argument check; otherwise the atom
  // is stored in mIDAttributeAtom.
  NS_ENSURE_ARG(aID);

  mIDAttributeAtom = aID;
  return NS_OK;
}


/*
 *  This method returns the typeid (the tag type) for this token.
 *
 *  @update  gess 3/25/98
 *  @param
 *  @return
 */
PRInt32 CStartToken::GetTypeID()
{
  // Resolve the tag id from the stored tag text the first time it is
  // requested while still unknown; the looked-up value is kept.
  if (mTypeID == eHTMLTag_unknown) {
    mTypeID = nsHTMLTags::LookupTag(mTextValue);
  }

  return mTypeID;
}

/*
 *
 *
 *  @update  gess 3/25/98
 *  @param
 *  @return
 */
const char* CStartToken::GetClassName(void)
{
  // Fixed label identifying start-tag tokens.
  return "start";
}

/*
 *
 *
 *  @update  gess 3/25/98
 *  @param
 *  @return
 */
PRInt32 CStartToken::GetTokenType(void)
{
  // Start-tag tokens always report eToken_start.
  return eToken_start;
}

/*
 *
 *
 *  @update  gess 3/25/98
 *  @param
 *  @return
 */
void CStartToken::SetEmpty(PRBool aValue)
{
  // Records the caller-supplied "empty" flag for this start token.
  mEmpty = aValue;
}

/*
 *
 *
 *  @update  gess 3/25/98
 *  @param
 *  @return
 */
PRBool CStartToken::IsEmpty(void)
{
  // Returns the flag last stored by the constructor or SetEmpty().
  return mEmpty;
}


/*
 *  Consume the identifier portion of the start tag
 *
 *  @update  gess 3/25/98
 *  @param   aChar -- last char consumed from stream
 *  @param   aScanner -- controller of underlying input source
 *  @param   aFlag - contains information such as |dtd mode|view mode|doctype|etc...
 *  @return  error result
 */
nsresult CStartToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
{
  // By the time we get here the '<' has already been consumed, so only
  // the tag identifier is read.  Neither the attributes nor the closing
  // '>' are consumed by this method.
  nsresult rv = NS_OK;

  if (!(aFlag & NS_IPARSER_FLAG_HTML)) {
    // Non-HTML input: the identifier is read straight into mTextValue.
    // PR_TRUE was added to this call for bug 46083: a tag written as
    // <title_> used to be seen as <title>, and the matching end tag
    // (</title_>) was then never found.
    rv = aScanner.ReadIdentifier(mTextValue, PR_TRUE);
    mTypeID = nsHTMLTags::LookupTag(mTextValue);
  }
  else {
    nsAutoString tagName;
    rv = aScanner.GetIdentifier(tagName, PR_TRUE);
    mTypeID = (PRInt32)nsHTMLTags::LookupTag(tagName);

    // The original spelling is kept only for user-defined tags or when
    // viewing source.
    if ((aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) ||
        eHTMLTag_userdefined == mTypeID) {
      mTextValue = tagName;
    }
  }

  // Whitespace after the identifier is skipped only when the read
  // succeeded and we are not in view-source mode.
  if (!NS_SUCCEEDED(rv) || (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
    return rv;
  }

  return aScanner.SkipWhitespace(mNewlineCount);
}


const nsAString& CStartToken::GetStringValue()
{
  // For tag ids strictly between eHTMLTag_unknown and eHTMLTag_text,
  // an empty text value is filled in from the tag table on demand.
  const PRBool inTagRange =
    (eHTMLTag_unknown < mTypeID) && (mTypeID < eHTMLTag_text);

  if (inTagRange && 0 == mTextValue.Length()) {
    mTextValue.Assign(nsHTMLTags::GetStringValue((nsHTMLTag) mTypeID));
  }

  return mTextValue;
}

/*
 *
 *
 *  @update  gess 3/25/98
 *  @param   anOutputString will receive the result
 *  @return  nada
 */
void CStartToken::GetSource(nsString& anOutputString)
{
  // Replace (rather than extend) the caller's string: clear it first,
  // then reuse the append path.
  anOutputString.Truncate();
  AppendSourceTo(anOutputString);
}

/*
 *
 *
 *  @update  harishd 03/23/00
 *  @param   result appended to the output string.
 *  @return  nada
 */
void CStartToken::AppendSourceTo(nsAString& anOutputString)
{
  anOutputString.Append(PRUnichar('<'));

  // Watch out for Bug 15204: when trailing content has been recorded it
  // is emitted as-is and no closing '>' is appended here.
  if (!mTrailingContent.IsEmpty()) {
    anOutputString.Append(mTrailingContent);
    return;
  }

  // Otherwise emit the stored tag text, falling back to the tag name
  // derived from the tag id, followed by '>'.
  if (mTextValue.IsEmpty()) {
    anOutputString.Append(GetTagName(mTypeID));
  }
  else {
    anOutputString.Append(mTextValue);
  }
  anOutputString.Append(PRUnichar('>'));
}

/*
 *  constructor from tag id
 *
 *  @update  gess 3/25/98
 *  @param
 *  @return
 */
CEndToken::CEndToken(eHTMLTags aTag)
  : CHTMLToken(aTag)
{
  // The tag id is all that is recorded for this form.
}

CEndToken::CEndToken(const nsAString& aName)
  : CHTMLToken(eHTMLTag_unknown)
{
  // Only the name is known; the tag id starts out as eHTMLTag_unknown.
  mTextValue.Assign(aName);
}

CEndToken::CEndToken(const nsAString& aName, eHTMLTags aTag)
  : CHTMLToken(aTag)
{
  // Name and tag id are both supplied by the caller.
  mTextValue.Assign(aName);
}

/*
 *  Consume the identifier portion of the end tag
 *
 *  @update  gess 3/25/98
 *  @param   aChar -- last char consumed from stream
 *  @param   aScanner -- controller of underlying input source
 *  @param   aFlag - contains information such as |dtd mode|view mode|doctype|etc...
 *  @return  error result
 */
nsresult CEndToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
{
  // Reads the identifier of an end tag and resolves its tag id.  Any
  // scanner failure is returned to the caller immediately.
  nsresult result = NS_OK;

  if (!(aFlag & NS_IPARSER_FLAG_HTML)) {
    // Non-HTML input: the identifier goes straight into mTextValue.
    result = aScanner.ReadIdentifier(mTextValue, PR_TRUE);
    NS_ENSURE_SUCCESS(result, result);

    mTypeID = nsHTMLTags::LookupTag(mTextValue);
  }
  else {
    nsAutoString tagName;
    result = aScanner.GetIdentifier(tagName, PR_TRUE);
    NS_ENSURE_SUCCESS(result, result);

    mTypeID = (PRInt32)nsHTMLTags::LookupTag(tagName);

    // The original spelling is kept for user-defined tags, when viewing
    // source, or when content is being preserved.
    if ((aFlag & (NS_IPARSER_FLAG_VIEW_SOURCE | NS_IPARSER_FLAG_PRESERVE_CONTENT)) ||
        eHTMLTag_userdefined == mTypeID) {
      mTextValue = tagName;
    }
  }

  // In view-source mode the whitespace after the identifier is left alone.
  if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
    return result;
  }

  result = aScanner.SkipWhitespace(mNewlineCount);
  NS_ENSURE_SUCCESS(result, result);

  return result;
}


/*
 *  Asks the token to determine the <i>HTMLTag type</i> of
 *  the token. This turns around and looks up the tag name
 *  in the tag dictionary.
 *
 *  @update  gess 3/25/98
 *  @param
 *  @return  eHTMLTag id of this endtag
 */
PRInt32 CEndToken::GetTypeID()
{
  // A known tag id is returned unchanged.
  if (mTypeID != eHTMLTag_unknown) {
    return mTypeID;
  }

  // Otherwise look the name up; end tags for <dir> and <menu> are
  // reported as eHTMLTag_ul.
  mTypeID = nsHTMLTags::LookupTag(mTextValue);
  if (eHTMLTag_dir == mTypeID || eHTMLTag_menu == mTypeID) {
    mTypeID = eHTMLTag_ul;
  }

  return mTypeID;
}

/*
 *
 *
 *  @update  gess 3/25/98
 *  @param
 *  @return
 */
const char* CEndToken::GetClassName(void)
{
  // Fixed label identifying end-tag tokens.
  return "/end";
}

/*
 *
 *
 *  @update  gess 3/25/98
 *  @param
 *  @return
 */
PRInt32 CEndToken::GetTokenType(void)
{
  // End-tag tokens always report eToken_end.
  return eToken_end;
}

const nsAString& CEndToken::GetStringValue()
{
  // For tag ids strictly between eHTMLTag_unknown and eHTMLTag_text,
  // an empty text value is filled in from the tag table on demand.
  const PRBool inTagRange =
    (eHTMLTag_unknown < mTypeID) && (mTypeID < eHTMLTag_text);

  if (inTagRange && 0 == mTextValue.Length()) {
    mTextValue.Assign(nsHTMLTags::GetStringValue((nsHTMLTag) mTypeID));
  }

  return mTextValue;
}

/*
 *
 *
 *  @update  gess 3/25/98
 *  @param   anOutputString will receive the result
 *  @return  nada
 */
void CEndToken::GetSource(nsString& anOutputString)
{
  // Replace (rather than extend) the caller's string: clear it first,
  // then reuse the append path.
  anOutputString.Truncate();
  AppendSourceTo(anOutputString);
}

/*
 *
 *
 *  @update  harishd 03/23/00
 *  @param   result appended to the output string.
 *  @return  nada
 */
void CEndToken::AppendSourceTo(nsAString& anOutputString)
{
  // Output has the form "</" + name + ">", where the name is the stored
  // tag text when present and otherwise the name derived from the tag id.
  anOutputString.Append(NS_LITERAL_STRING("</"));

  if (mTextValue.IsEmpty()) {
    anOutputString.Append(GetTagName(mTypeID));
  }
  else {
    anOutputString.Append(mTextValue);
  }

  anOutputString.Append(PRUnichar('>'));
}

/*
 *  default constructor
 *
 *  @update  gess 3/25/98
 *  @param   aName -- string to init token name with
 *  @return
 */
CTextToken::CTextToken()
  : CHTMLToken(eHTMLTag_text)
{
  // Text tokens always carry the eHTMLTag_text id; no text is bound yet.
}


/*
 *  string based constructor
 *
 *  @update  gess 3/25/98
 *  @param   aName -- string to init token name with
 *  @return
 */
CTextToken::CTextToken(const nsAString& aName)
  : CHTMLToken(eHTMLTag_text)
{
  // Point the token's text at the supplied string.
  mTextValue.Rebind(aName);
}

/*
 *
 *
 *  @update  gess 3/25/98
 *  @param
 *  @return
 */
const char* CTextToken::GetClassName(void)
{
  // Fixed label identifying text tokens.
  return "text";
}

/*
 *
 *
 *  @update  gess 3/25/98
 *  @param
 *  @return
 */
PRInt32 CTextToken::GetTokenType(void)
{
  // Text tokens always report eToken_text.
  return eToken_text;
}

PRInt32 CTextToken::GetTextLength(void)
{
  // Length of the text currently bound to this token.
  return mTextValue.Length();
}

/*
 *  Consume as much clear text from scanner as possible.
 *
 *  @update  gess 3/25/98
 *  @param   aChar -- last char consumed from stream
 *  @param   aScanner -- controller of underlying input source
 *  @return  error result
 */
nsresult CTextToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 aFlag) {
  // Binds mTextValue to a run of text that starts at the scanner's current
  // position.  The run stops at '&' or '<'; line breaks are absorbed into
  // the run and counted in mNewlineCount.
  //
  //   aChar    -- used only as scratch storage for peeked/read characters
  //   aScanner -- input source; its position is advanced past the run
  //   aFlag    -- not consulted by this method
  //
  // Returns the last scanner result (NS_OK, or whatever error the scanner
  // reported, e.g. when the input ran out).
  static const PRUnichar theTerminalsChars[] =
    { PRUnichar('\n'), PRUnichar('\r'), PRUnichar('&'), PRUnichar('<'),
      PRUnichar(0) };
  static const nsReadEndCondition theEndCondition(theTerminalsChars);
  nsresult  result=NS_OK;
  PRBool    done=PR_FALSE;
  nsScannerIterator origin, start, end;

  // Start scanning after the first character, because we know it to
  // be part of this text token (we wouldn't have come here if it weren't)
  aScanner.CurrentPosition(origin);
  start = origin;
  ++start;
  aScanner.SetPosition(start);
  aScanner.EndReading(end);

  while((NS_OK==result) && (!done)) {
    // NOTE(review): ReadUntil presumably leaves |end| at the terminating
    // character it stopped on -- the code below relies on that; confirm
    // against nsScanner.
    result=aScanner.ReadUntil(start, end, theEndCondition, PR_FALSE);
    if(NS_OK==result) {
      result=aScanner.Peek(aChar);

      if(((kCR==aChar) || (kNewLine==aChar)) && (NS_OK==result)) {
        result=aScanner.GetChar(aChar); //strip off the char
        // Initialized so that a failed Peek (for instance at the end of the
        // available input) can never lead to reading an indeterminate value
        // in the kCR case below.  CMarkupDeclToken::Consume does the same.
        PRUnichar theNextChar=0;
        result=aScanner.Peek(theNextChar);    //then see what's next.
        switch(aChar) {
          case kCR:
            // result=aScanner.GetChar(aChar);
            if(kLF==theNextChar) {
              // If the "\r" is followed by a "\n", don't replace it and
              // let it be ignored by the layout system
              end.advance(2);
              result=aScanner.GetChar(theNextChar);
            }
            else {
              // If it standalone, replace the "\r" with a "\n" so that
              // it will be considered by the layout system
              aScanner.ReplaceCharacter(end, kLF);
              ++end;
            }
            ++mNewlineCount;
            break;
          case kLF:
            ++end;
            ++mNewlineCount;
            break;
        } //switch
      }
      else done=PR_TRUE;  // stopped on '&' or '<', or Peek failed
    }
  }

  // Bind whatever was collected, even if the loop ended on an error.
  aScanner.BindSubstring(mTextValue, origin, end);

  return result;
}

/*
 *  Consume as much clear text from scanner as possible.
 *
 *  @update  gess 3/25/98
 *  @param   aChar -- last char consumed from stream
 *  @param   aScanner -- controller of underlying input source
 *  @return  error result
 */
// Collects raw text up to the end tag "</" + aEndTagName (matched
// case-insensitively), for content in which ordinary tags are not
// recognized.
//
//   aChar           -- not used by this method
//   aIgnoreComments -- when true, "<!--" ... "-->" is not taken into account
//                      while deciding whether an end tag is genuine
//   aScanner        -- input source
//   aEndTagName     -- name of the terminating tag; overwritten with the
//                      text found between "</" and '>' when view-source or
//                      preserve-content is requested in aFlag
//   aFlag           -- parser flags (strict mode, view source, preserve content)
//   aFlushTokens    -- set to PR_TRUE when the end tag has been found
//
// Returns NS_OK when the end tag was found (mTextValue is then bound to the
// text before it and the scanner is positioned after the end tag's '>'),
// NS_OK without binding anything when no end tag exists and the scanner is
// not incremental, or kEOF when more data may still arrive.
nsresult CTextToken::ConsumeUntil(PRUnichar aChar,PRBool aIgnoreComments,nsScanner& aScanner,
                                  nsString& aEndTagName,PRInt32 aFlag,PRBool& aFlushTokens){
  nsresult      result=NS_OK;
  nsScannerIterator theStartOffset, theCurrOffset, theTermStrPos, theStartCommentPos, theAltTermStrPos, endPos;
  PRBool        done=PR_FALSE;
  PRBool        theLastIteration=PR_FALSE;

  aScanner.CurrentPosition(theStartOffset);
  theCurrOffset = theStartOffset;
  aScanner.EndReading(endPos);
  // endPos doubles as the "not found yet" marker for these three positions.
  theTermStrPos = theStartCommentPos = theAltTermStrPos = endPos;

  // ALGORITHM: *** The performance is based on correctness of the document ***
  // 1. Look for a '<' character.  This could be
  //    a) Start of a comment (<!--), b) Start of the terminal string, or c) a start of a tag.
  //    We are interested in a) and b). c) is ignored because in CDATA we don't care for tags.
  //    NOTE: Technically speaking in CDATA we should ignore the comments too!! But for compatibility
  //          we don't.
  // 2. Having the offset, for '<', search for the terminal string from there on and record its offset.
  // 3. From the same '<' offset also search for start of a comment '<!--'. If found search for
  //    end comment '-->' between the terminal string and '<!--'.  If you did not find the end
  //    comment, then we have a malformed document, i.e., this section has a premature terminal string
  //    Ex. <SCRIPT><!-- document.write('</SCRIPT>') //--> </SCRIPT>. But anyway record terminal string's
  //    offset and update the current offset to the terminal string (premature) offset and goto step 1.
  // 4. Amen...If you found a terminal string and '-->'. Otherwise goto step 1.
  // 5. If the end of the document is reached and if we still don't have the condition in step 4. then
  //    assume that the premature terminal string is the actual terminal string and goto step 1. This
  //    will be our last iteration.

  const NS_NAMED_LITERAL_STRING(ltslash, "</");
  const nsString theTerminalString = ltslash + aEndTagName;

  PRUint32 termStrLen=theTerminalString.Length();
  while((result == NS_OK) && !done) {
    PRBool found = PR_FALSE;
    nsScannerIterator gtOffset,ltOffset = theCurrOffset;
    // Visit each '<' that still has at least termStrLen characters after it.
    while (FindCharInReadable(PRUnichar(kLessThan), ltOffset, endPos) &&
           ((PRUint32)ltOffset.size_forward() >= termStrLen ||
            Distance(ltOffset, endPos) >= termStrLen)) {
      // Make a copy of the (presumed) end tag and
      // do a case-insensitive comparison

      nsScannerIterator start(ltOffset), end(ltOffset);
      end.advance(termStrLen);

      // The candidate only counts when the character right after it is
      // '>' or one of space, tab, newline, carriage return, backspace.
      if (CaseInsensitiveFindInReadable(theTerminalString,start,end) &&
          end != endPos && (*end == '>'  || *end == ' '  ||
                            *end == '\t' || *end == '\n' ||
                            *end == '\r' || *end == '\b')) {
        gtOffset = end;
        // The end tag is accepted only once its closing '>' is available.
        if (FindCharInReadable(PRUnichar(kGreaterThan), gtOffset, endPos)) {
          found = PR_TRUE;
          theTermStrPos = start;
        }
        break;
      }
      ltOffset.advance(1);
    }

    if (found && theTermStrPos != endPos) {
      // Comment handling is skipped in strict mode, on the final fallback
      // iteration, and when the caller asked to ignore comments.
      if(!(aFlag & NS_IPARSER_FLAG_STRICT_MODE) &&
         !theLastIteration && !aIgnoreComments) {
        nsScannerIterator endComment(ltOffset);
        endComment.advance(5);

        // Look for "<!--" (only if one has not been recorded already)
        // between the current offset and just past the candidate's '<'.
        if ((theStartCommentPos == endPos) &&
            FindInReadable(NS_LITERAL_STRING("<!--"), theCurrOffset, endComment)) {
          theStartCommentPos = theCurrOffset;
        }

        if (theStartCommentPos != endPos) {
          // Search for --> between <!-- and </TERMINALSTRING>.
          theCurrOffset = theStartCommentPos;
          nsScannerIterator terminal(theTermStrPos);
          if (!RFindInReadable(NS_LITERAL_STRING("-->"),
                               theCurrOffset, terminal)) {
            // If you're here it means that we have a bogus terminal string.
            // Even though it is bogus, the position of the terminal string
            // could be helpful in case we hit the rock bottom.
            theAltTermStrPos = theTermStrPos;

            // We did not find '-->' so keep searching for terminal string.
            theCurrOffset = theTermStrPos;
            theCurrOffset.advance(termStrLen);
            continue;
          }
        }
      }

      // Make sure to preserve the end tag's representation if needed
      if(aFlag & (NS_IPARSER_FLAG_VIEW_SOURCE | NS_IPARSER_FLAG_PRESERVE_CONTENT)) {
        CopyUnicodeTo(ltOffset.advance(2),gtOffset,aEndTagName);
      }

      // The token's text is everything before the end tag; scanning
      // resumes right after the end tag's '>'.
      aScanner.BindSubstring(mTextValue, theStartOffset, theTermStrPos);
      aScanner.SetPosition(gtOffset.advance(1));

      // We found </SCRIPT>...permit flushing -> Ref: Bug 22485
      aFlushTokens=PR_TRUE;
      done = PR_TRUE;
    }
    else {
      // We end up here if:
      // a) when the buffer runs out of data.
      // b) when the terminal string is not found.
      if(!aScanner.IsIncremental()) {
        if(theAltTermStrPos != endPos) {
          // If you're here it means..we hit the rock bottom and therefore switch to plan B.
          // Plan B: restart from the earlier "bogus" terminal string and
          // accept it on the next pass (theLastIteration skips the
          // comment check).
          theCurrOffset = theAltTermStrPos;
          theLastIteration = PR_TRUE;
        }
        else {
          done = PR_TRUE; // Do this to fix Bug. 35456
        }
      }
      else {
       // More data may still arrive; report kEOF to the caller.
       result=kEOF;
      }
    }
  }
  return result;
}

void CTextToken::CopyTo(nsAString& aStr)
{
  // Copy the whole range of the bound text into the caller's string.
  nsScannerIterator first, last;
  mTextValue.EndReading(last);
  mTextValue.BeginReading(first);

  CopyUnicodeTo(first, last, aStr);
}

const nsAString& CTextToken::GetStringValue(void)
{
  // Expose the bound text through its string view.
  return mTextValue.AsString();
}

void CTextToken::Bind(nsScanner* aScanner, nsScannerIterator& aStart, nsScannerIterator& aEnd)
{
  // Ask the scanner to bind this token's text to the range [aStart, aEnd).
  // aScanner is dereferenced without a null check.
  aScanner->BindSubstring(mTextValue, aStart, aEnd);
}

void CTextToken::Bind(const nsAString& aStr)
{
  // Point the token's text at the supplied string.
  mTextValue.Rebind(aStr);
}

/*
 *  default constructor
 *
 *  @update  vidur 11/12/98
 *  @param   aName -- string to init token name with
 *  @return
 */
CCDATASectionToken::CCDATASectionToken(eHTMLTags aTag)
  : CHTMLToken(aTag)
{
  // The tag id is all that is recorded for this form.
}


/*
 *  string based constructor
 *
 *  @update  vidur 11/12/98
 *  @param   aName -- string to init token name with
 *  @return
 */
CCDATASectionToken::CCDATASectionToken(const nsAString& aName)
  : CHTMLToken(eHTMLTag_unknown)
{
  // Copy the supplied text; the tag id starts out as eHTMLTag_unknown.
  mTextValue.Assign(aName);
}

/*
 *
 *
 *  @update  vidur 11/12/98
 *  @param
 *  @return
 */
const char* CCDATASectionToken::GetClassName(void)
{
  // Fixed label identifying CDATA-section tokens.
  return "cdatasection";
}

/*
 *
 *  @update  vidur 11/12/98
 *  @param
 *  @return
 */
PRInt32 CCDATASectionToken::GetTokenType(void)
{
  // CDATA-section tokens always report eToken_cdatasection.
  return eToken_cdatasection;
}

/*
 *  Consume as much marked text from scanner as possible.
 *
 *  @update  rgess 12/15/99: had to handle case: "<![ ! IE 5]>", in addition to "<![..[..]]>".
 *  @param   aChar -- last char consumed from stream
 *  @param   aScanner -- controller of underlying input source
 *  @return  error result
 */
// Appends the contents of a marked section to mTextValue until its
// terminator is reached.
//
//   aChar    -- used only as scratch storage for peeked/read characters
//   aScanner -- input source
//   aFlag    -- only NS_IPARSER_FLAG_VIEW_SOURCE is consulted
//
// Line breaks are normalized while copying ("\r\n" and a lone "\r" each
// become "\n") and are counted in mNewlineCount.  Returns the last scanner
// result.
nsresult CCDATASectionToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 aFlag) {
  // Characters at which the bulk read stops so they can be handled below.
  static const PRUnichar theTerminalsChars[] =
  { PRUnichar('\r'), PRUnichar('\n'), PRUnichar(']'), PRUnichar(0) };
  static const nsReadEndCondition theEndCondition(theTerminalsChars);
  nsresult  result=NS_OK;
  PRBool    done=PR_FALSE;

  while((NS_OK==result) && (!done)) {
    result=aScanner.ReadUntil(mTextValue,theEndCondition,PR_FALSE);
    if(NS_OK==result) {
      result=aScanner.Peek(aChar);
      if((kCR==aChar) && (NS_OK==result)) {
        result=aScanner.GetChar(aChar); //strip off the \r
        result=aScanner.Peek(aChar);    //then see what's next.
        // Note: if this Peek fails, nothing is appended for the "\r" that
        // was just stripped, and the loop ends with that error.
        if(NS_OK==result) {
          switch(aChar) {
            case kCR:
              // "\r\r" becomes two newlines
              result=aScanner.GetChar(aChar); //strip off the \r
              mTextValue.Append(NS_LITERAL_STRING("\n\n"));
              mNewlineCount += 2;
              break;
            case kNewLine:
               //which means we saw \r\n, which becomes \n
              result=aScanner.GetChar(aChar); //strip off the \n
                  //now fall through on purpose...
            default:
              // a lone "\r" (or the "\r\n" case above) becomes one newline
              mTextValue.Append(NS_LITERAL_STRING("\n"));
              mNewlineCount++;
              break;
          } //switch
        } //if
      }
      else if (kNewLine == aChar) {
        // A plain "\n" is copied through unchanged.
        result=aScanner.GetChar(aChar);
        mTextValue.Append(aChar);
        ++mNewlineCount;
      }
      else if (kRightSquareBracket == aChar) {
        result=aScanner.GetChar(aChar); //strip off the ]
        mTextValue.Append(aChar);
        result=aScanner.Peek(aChar);    //then see what's next.
        if((NS_OK==result) && (kRightSquareBracket==aChar)) {
          result=aScanner.GetChar(aChar); //strip off the second ]
          mTextValue.Append(aChar);
        }
        // The goal here is to not lose data from the page when encountering
        // markup like: <![endif]-->.  This means that in normal parsing, we
        // allow ']' to end the marked section and just drop everything between
        // it and the '>'.  In view-source mode, we cannot drop things on the
        // floor like that.  In fact, to make view-source of XML with script in
        // CDATA sections at all bearable, we need to somewhat enforce the ']>'
        // terminator for marked sections.  So make the tokenization somewhat
        // different when in view-source _and_ dealing with a CDATA section.
        PRBool inCDATA = (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) &&
          StringBeginsWith(mTextValue, NS_LITERAL_STRING("[CDATA["));
        if (inCDATA) {
          // Only look at the next character; nothing is skipped.
          result = aScanner.Peek(aChar);
        } else {
          nsAutoString dummy; // skip any bad data
          result=aScanner.ReadUntil(dummy,kGreaterThan,PR_FALSE);
        }
        // Outside view-source CDATA the section ends here; inside it, only
        // when the character right after the bracket(s) is '>'.
        if (NS_OK==result &&
            (!inCDATA || kGreaterThan == aChar)) {
          result=aScanner.GetChar(aChar); //strip off the >
          done=PR_TRUE;
        }
      }
      else done=PR_TRUE;  // Peek failed or an unexpected character was seen
    }
  }
  return result;
}

const nsAString& CCDATASectionToken::GetStringValue(void)
{
  // The accumulated section text is returned as-is.
  return mTextValue;
}


/*
 *  default constructor
 *
 *  @param   aName -- string to init token name with
 *  @return
 */
CMarkupDeclToken::CMarkupDeclToken()
  : CHTMLToken(eHTMLTag_markupDecl)
{
  // Markup declarations always carry the eHTMLTag_markupDecl id.
}


/*
 *  string based constructor
 *
 *  @param   aName -- string to init token name with
 *  @return
 */
CMarkupDeclToken::CMarkupDeclToken(const nsAString& aName)
  : CHTMLToken(eHTMLTag_markupDecl)
{
  // Point the token's text at the supplied string.
  mTextValue.Rebind(aName);
}

/*
 *
 *
 *  @param
 *  @return
 */
const char* CMarkupDeclToken::GetClassName(void)
{
  // Fixed label identifying markup-declaration tokens.
  return "markupdeclaration";
}

/*
 *
 *  @param
 *  @return
 */
PRInt32 CMarkupDeclToken::GetTokenType(void)
{
  // Markup-declaration tokens always report eToken_markupDecl.
  return eToken_markupDecl;
}

/*
 *  Consume as much declaration from scanner as possible.
 *  Declaration is a markup declaration of ELEMENT, ATTLIST, ENTITY or
 *  NOTATION, which can span multiple lines and ends in >.
 *
 *  @param   aChar -- last char consumed from stream
 *  @param   aScanner -- controller of underlying input source
 *  @return  error result
 */
// Binds mTextValue to the text of a markup declaration, from the scanner's
// current position up to (not including) the first '>' that lies outside a
// quoted string, and positions the scanner just past that '>'.
//
//   aChar    -- used only as scratch storage for peeked/read characters
//   aScanner -- input source
//   aFlag    -- not consulted by this method
//
// Line breaks inside the declaration are counted in mNewlineCount.
// Returns the last scanner result.
nsresult CMarkupDeclToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 aFlag) {
  // Characters at which the bulk read stops so they can be handled below.
  static const PRUnichar theTerminalsChars[] =
    { PRUnichar('\n'), PRUnichar('\r'), PRUnichar('\''), PRUnichar('"'),
      PRUnichar('>'),
      PRUnichar(0) };
  static const nsReadEndCondition theEndCondition(theTerminalsChars);
  nsresult  result=NS_OK;
  PRBool    done=PR_FALSE;
  // Holds the quote character of the string we are inside, 0 when outside.
  PRUnichar quote=0;

  nsScannerIterator origin, start, end;
  aScanner.CurrentPosition(origin);
  start = origin;

  while((NS_OK==result) && (!done)) {
    aScanner.SetPosition(start);
    // NOTE(review): ReadUntil presumably leaves |end| at the terminating
    // character it stopped on -- the cases below rely on that; confirm
    // against nsScanner.
    result=aScanner.ReadUntil(start, end, theEndCondition, PR_FALSE);
    if(NS_OK==result) {
      result=aScanner.Peek(aChar);

      if(NS_OK==result) {
        PRUnichar theNextChar=0;
        if ((kCR==aChar) || (kNewLine==aChar)) {
          result=aScanner.GetChar(aChar); //strip off the char
          result=aScanner.Peek(theNextChar);    //then see what's next.
        }
        switch(aChar) {
          case kCR:
            // result=aScanner.GetChar(aChar);
            if(kLF==theNextChar) {
              // If the "\r" is followed by a "\n", don't replace it and
              // let it be ignored by the layout system
              end.advance(2);
              result=aScanner.GetChar(theNextChar);
            }
            else {
              // If it standalone, replace the "\r" with a "\n" so that
              // it will be considered by the layout system
              aScanner.ReplaceCharacter(end, kLF);
              ++end;
            }
            ++mNewlineCount;
            break;
          case kLF:
            ++end;
            ++mNewlineCount;
            break;
          case '\'':
          case '"':
            ++end;
            // A quote opens a string when none is open, and closes it only
            // when it matches the opening quote character.
            if (quote) {
              if (quote == aChar) {
                quote = 0;
              }
            } else {
              quote = aChar;
            }
            break;
          case kGreaterThan:
            if (quote) {
              // A '>' inside a quoted string is part of the declaration.
              ++end;
            } else {
              start = end;
              ++start;  // Note that start is wrong after this, we just avoid temp var
              aScanner.SetPosition(start); // Skip the >
              done=PR_TRUE;
            }
            break;
          default:
            NS_ABORT_IF_FALSE(0,"should not happen, switch is missing cases?");
            break;
        } //switch
        // Resume the next read from where this one stopped.
        start = end;
      }
      else done=PR_TRUE;
    } // if read until !ok
  } // while

  // Bind whatever was collected, even if the loop ended on an error.
  aScanner.BindSubstring(mTextValue, origin, end);

  return result;
}

const nsAString& CMarkupDeclToken::GetStringValue(void)
{
  // Expose the bound declaration text through its string view.
  return mTextValue.AsString();
}


/*
 *  Default constructor
 *
 *  @update  gess 3/25/98
 *  @param   aName -- string to init token name with
 *  @return
 */
CCommentToken::CCommentToken()
  : CHTMLToken(eHTMLTag_comment)
{
  // Comment tokens always carry the eHTMLTag_comment id.
}


/*
 *  String based constructor
 *
 *  @update  gess 3/25/98
 *  @param
 *  @return
 */
CCommentToken::CCommentToken(const nsAString& aName)
  : CHTMLToken(eHTMLTag_comment)
{
  // Point the comment text at the supplied string.
  mComment.Rebind(aName);
}

void CCommentToken::AppendSourceTo(nsAString& anOutputString)
{
  // The source form of a comment is its full declaration text
  // (mCommentDecl), not just the comment data.
  AppendUnicodeTo(mCommentDecl, anOutputString);
}

static PRBool IsCommentEnd(
  const nsScannerIterator& aCurrent,
  const nsScannerIterator& aEnd,
  nsScannerIterator& aGt)
{
  // Walks forward from aCurrent looking for '>'.  On success aGt is set to
  // the position of that '>' and PR_TRUE is returned.  The walk gives up
  // (PR_FALSE, aGt untouched) on reaching aEnd or once two consecutive '-'
  // characters have been seen.
  PRInt32 dashRun = 0;

  for (nsScannerIterator pos = aCurrent;
       (pos != aEnd) && (dashRun != 2);
       ++pos) {
    if (*pos == kGreaterThan) {
      aGt = pos;
      return PR_TRUE;
    }

    // Count consecutive dashes; any other character resets the run.
    dashRun = (*pos == PRUnichar('-')) ? dashRun + 1 : 0;
  }

  return PR_FALSE;
}

// Consumes a comment using the strict "<!-- ... -- ... -->" rules.
//
// Expects the scanner to be positioned just after "<!" (the code steps back
// two characters to find the '<').  On success mCommentDecl is bound to the
// whole declaration from '<' through '>', mComment to the data inside it,
// and the scanner is positioned after the '>'.
//
// Returns NS_OK when a comment (or a bogus "<!...>" declaration) was
// consumed, kEOF when the end was not found and the scanner is incremental,
// and NS_OK without consuming anything when the end was not found and no
// more data will arrive.
nsresult CCommentToken::ConsumeStrictComment(nsScanner& aScanner)
{
  // <!--[... -- ... -- ...]*-->
  /*********************************************************
    NOTE: This algorithm does a fine job of handling comments
          when they're formatted per spec, but if they're not
          we don't handle them well.
   *********************************************************/
  nsScannerIterator end, current, gt, lt;
  aScanner.EndReading(end);
  aScanner.CurrentPosition(current);

  // beginData == end means "no opening '--' seen" (tested further down).
  nsScannerIterator beginData = end;

  lt = current;
  lt.advance(-2); // <!

  // Regular comment must start with <!--
  if (current != end && *current == kMinus &&
      ++current != end && *current == kMinus &&
      ++current != end) {
    nsScannerIterator currentEnd = end;
    PRBool balancedComment = PR_FALSE;
    static NS_NAMED_LITERAL_STRING(dashes,"--");
    beginData = current;

    // NOTE(review): FindInReadable appears to narrow |current| and
    // |currentEnd| to the match it finds, which is why |currentEnd| is reset
    // to |end| at the bottom of the loop -- confirm against nsReadableUtils.
    while (FindInReadable(dashes, current, currentEnd)) {
      current.advance(2);

      balancedComment = !balancedComment; // We need to match '--' with '--'

      if (balancedComment && IsCommentEnd(current, end, gt)) {
        // done
        current.advance(-2);  // back up to the start of the closing '--'
        if (beginData != current) { // protects from <!---->
          aScanner.BindSubstring(mComment, beginData, current);
        }
        aScanner.BindSubstring(mCommentDecl, lt, ++gt);
        aScanner.SetPosition(gt);
        return NS_OK;
      } else {
        // Continue after the last '--'
        currentEnd = end;
      }
    }
  }

  // If beginData == end, we did not find opening '--'
  if (beginData == end) {
    // This might have been empty comment: <!>
    // Or it could have been something completely bogus like: <!This is foobar>
    // Handle both cases below
    aScanner.CurrentPosition(current);
    beginData = current;
    if (FindCharInReadable('>', current, end)) {
      aScanner.BindSubstring(mComment, beginData, current);
      aScanner.BindSubstring(mCommentDecl, lt, ++current);
      aScanner.SetPosition(current);
      return NS_OK;
    }
  }

  if (aScanner.IsIncremental()) {
    // We got here because we saw the beginning of a comment,
    // but not yet the end, and we are still loading the page. In that
    // case the return value here will cause us to unwind,
    // wait for more content, and try again.
    // XXX For performance reasons we should cache where we were, and
    //     continue from there for next call
    return kEOF; // not really an nsresult, but...
  }

  // XXX We should return kNotAComment, parse comment open as text, and parse
  //     the rest of the document normally. Now we ALMOST do that: <! is
  //     missing from the content model.
  return NS_OK;
}

/*
 *  Consume a comment using the lenient ("quirks") rules. The caller has
 *  already consumed the leading "<!"; the scanner is positioned right
 *  after it.
 *
 *  Two shapes are recognized:
 *   - long form, opened by "<!--": ended by the first '>' that is preceded
 *     by "--" or "--!" (not counting the opening minuses); "<!-->" is also
 *     accepted as a complete, empty comment.
 *   - short form (anything else following "<!"): ended by the first '>';
 *     a trailing "-", "--", "!", "-!" or "--!" is left out of the text.
 *
 *  On success mComment is bound to the comment text (it is left untouched
 *  when there is no text), mCommentDecl is bound to the whole declaration
 *  from '<' through '>', and the scanner is positioned after it.
 *
 *  @param   aScanner -- controller of underlying input source
 *  @return  NS_OK when a comment was consumed; kEOF when no terminator
 *           was found (long form: only while loading incrementally;
 *           short form: always)
 */
nsresult CCommentToken::ConsumeQuirksComment(nsScanner& aScanner)
{
  // <![-[-]] ... [[-]-|--!]>
  /*********************************************************
    NOTE: This algorithm does a fine job of handling comments
          commonly used, but it doesn't really consume them
          per spec (But then, neither does IE or Nav).
   *********************************************************/
  nsScannerIterator end, current;
  aScanner.EndReading(end);
  aScanner.CurrentPosition(current);
  // beginData:         first character of the comment text
  // beginLastMinus:    last '-' of the opening delimiter (end if none)
  // bestAltCommentEnd: first '>' seen, used as fallback terminator
  // lt:                the '<' that opened the declaration
  nsScannerIterator beginData = current,
                    beginLastMinus = end,
                    bestAltCommentEnd = end,
                    lt = current;
  lt.advance(-2); // <!

  // When we get here, we have always already consumed <!
  // Skip over possible leading minuses
  if (current != end && *current == kMinus) {
    beginLastMinus = current;
    ++current;
    ++beginData;
    if (current != end && *current == kMinus) { // <!--
      beginLastMinus = current;
      ++current;
      ++beginData;
      // Long form comment

      nsScannerIterator currentEnd = end, gt = end;

      // Find the end of the comment
      while (FindCharInReadable(kGreaterThan, current, currentEnd)) {
        gt = current;
        // Remember only the very first '>' as the alternate end.
        if (bestAltCommentEnd == end) {
          bestAltCommentEnd = gt;
        }
        // Walk backwards from the '>' looking for "--" or "--!", never
        // stepping onto the minuses that opened the comment.
        --current;
        PRBool goodComment = PR_FALSE;
        if (current != beginLastMinus && *current == kMinus) { // ->
          --current;
          if (current != beginLastMinus && *current == kMinus) { // -->
            goodComment = PR_TRUE;
            --current;
          }
        } else if (current != beginLastMinus && *current == '!') {
          --current;
          if (current != beginLastMinus && *current == kMinus) {
            --current;
            if (current != beginLastMinus && *current == kMinus) { // --!>
              --current;
              goodComment = PR_TRUE;
            }
          }
        } else if (current == beginLastMinus) {
          // The '>' directly follows the opening minuses: <!-->
          goodComment = PR_TRUE;
        }

        if (goodComment) {
          // done
          if (beginLastMinus != current) { // protects from <!---->
            aScanner.BindSubstring(mComment, beginData, ++current);
          }
          aScanner.BindSubstring(mCommentDecl, lt, ++gt);
          aScanner.SetPosition(gt);
          return NS_OK;
        } else {
          // try again starting after the last '>'
          current = ++gt;
          currentEnd = end;
        }
      } //while

      if (aScanner.IsIncremental()) {
        // We got here because we saw the beginning of a comment,
        // but not yet the end, and we are still loading the page. In that
        // case the return value here will cause us to unwind,
        // wait for more content, and try again.
        // XXX For performance reasons we should cache where we were, and
        //     continue from there for next call
        return kEOF;  // not really an nsresult, but...
      }

      // If you're here, then we're in a special state.
      // The problem at hand is that we've hit the end of the document
      // without finding the normal endcomment delimiter "-->".
      // In this case, the first thing we try is to see if we found an
      // alternate endcomment delimiter ">".
      // If so, rewind to just past that, and use everything up to that
      // point as the comment.
      // If not, the document has no end comment and should be treated as
      // one big comment.
      gt = bestAltCommentEnd;
      if (beginData != gt) { // protects from <!-->
        aScanner.BindSubstring(mComment, beginData, gt);
      }
      // Include the '>' in the declaration when one was found.
      if (gt != end) {
        ++gt;
      }
      aScanner.BindSubstring(mCommentDecl, lt, gt);
      aScanner.SetPosition(gt);
      return NS_OK;
    }
  }

  // This could be short form of comment
  // Find the end of the comment
  current = beginData;
  if (FindCharInReadable(kGreaterThan, current, end)) {
    nsScannerIterator gt = current;
    // Step back over a trailing "-", "--", "!", "-!" or "--!" so that it
    // is not included in the comment text; never step before beginData.
    if (current != beginData) {
      --current;
      if (current != beginData && *current == kMinus) { // ->
        --current;
        if (current != beginData && *current == kMinus) { // -->
          --current;
        }
      } else if (current != beginData && *current == '!') { // !>
        --current;
        if (current != beginData && *current == kMinus) { // -!>
          --current;
          if (current != beginData && *current == kMinus) { // --!>
            --current;
          }
        }
      }
    }

    // current == gt only when the '>' sits right at beginData, i.e. there
    // is no comment text at all.
    if (current != gt) {
      aScanner.BindSubstring(mComment, beginData, ++current);
    }
    aScanner.BindSubstring(mCommentDecl, lt, ++gt);
    aScanner.SetPosition(gt);
    return NS_OK;
  }

  // No '>' anywhere in the available input.
  return kEOF; // not really an nsresult, but...
}

/*
 *  Consume the body of a comment. The leading "<!" has already been
 *  eaten by the caller.
 *
 *  @param   aChar -- last char consumed from stream (not used here)
 *  @param   aScanner -- controller of underlying input source
 *  @param   aFlag -- parser flags: NS_IPARSER_FLAG_STRICT_MODE selects the
 *                    strict comment consumer, NS_IPARSER_FLAG_VIEW_SOURCE
 *                    sets the newline count to -1 instead of counting
 *  @return  result of the selected comment consumer
 */
nsresult
CCommentToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
{
  // Enabling strict comment parsing for Bug 53011 and 2749 contradicts!!!!
  const PRBool strictMode = (aFlag & NS_IPARSER_FLAG_STRICT_MODE) != 0;
  nsresult rv = strictMode ? ConsumeStrictComment(aScanner)
                           : ConsumeQuirksComment(aScanner);

  if (NS_FAILED(rv)) {
    return rv;
  }

  if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
    mNewlineCount = -1;
  }
  else {
    mNewlineCount = mCommentDecl.CountChar(kNewLine);
  }

  return rv;
}

/*
 *  Retrieve the comment text bound by the last successful Consume().
 *
 *  @return  string view of mComment
 */
const nsAString&
CCommentToken::GetStringValue()
{
  return mComment.AsString();
}

/*
 *  Retrieve the class-name string of comment tokens.
 *
 *  @return  the constant string shown below
 */
const char*
CCommentToken::GetClassName()
{
  return "/**/";
}

/*
 *  Retrieve the token-type code of comment tokens.
 *
 *  @return  eToken_comment
 */
PRInt32
CCommentToken::GetTokenType()
{
  return eToken_comment;
}

/*
 *  Default constructor; tags the token as eHTMLTag_newline.
 */
CNewlineToken::CNewlineToken()
  : CHTMLToken(eHTMLTag_newline)
{
}


/*
 *  Retrieve the class-name string of newline tokens.
 *
 *  @return  the constant string "crlf"
 */
const char*
CNewlineToken::GetClassName()
{
  return "crlf";
}

/*
 *  Retrieve the token-type code of newline tokens.
 *
 *  @return  eToken_newline
 */
PRInt32
CNewlineToken::GetTokenType()
{
  return eToken_newline;
}


// Shared "\n" string returned by every CNewlineToken::GetStringValue()
// call. Created by AllocNewline() and destroyed by FreeNewline().
static nsScannerSubstring* gNewlineStr = nsnull;

/*
 *  Allocate the shared newline string. Calling this again while the
 *  string already exists is a no-op, so a repeated call does not leak the
 *  earlier allocation.
 */
void CNewlineToken::AllocNewline()
{
  if (!gNewlineStr) {
    gNewlineStr = new nsScannerSubstring(NS_LITERAL_STRING("\n"));
  }
}

/*
 *  Destroy the shared newline string and reset the pointer so that a
 *  second call is harmless.
 */
void
CNewlineToken::FreeNewline()
{
  // Deleting a null pointer is a no-op, so no explicit check is needed.
  delete gNewlineStr;
  gNewlineStr = nsnull;
}

/**
 *  Retrieve the string value of a newline token, which is the shared
 *  string held in gNewlineStr.
 *
 *  NOTE(review): gNewlineStr is dereferenced unconditionally, so this
 *  presumably requires AllocNewline() to have run first -- confirm.
 *
 *  @return reference to the shared newline string
 */
const nsAString&
CNewlineToken::GetStringValue()
{
  return gNewlineStr->AsString();
}

/*
 *  Consume the second half of a two-character line break, if present.
 *
 *  Here's what the HTML spec says about newlines:
 *
 *  "A line break is defined to be a carriage return (&#x000D;),
 *   a line feed (&#x000A;), or a carriage return/line feed pair.
 *   All line breaks constitute white space."
 *
 *  @param   aChar -- last char consumed from stream (CR or LF)
 *  @param   aScanner -- controller of underlying input source
 *  @param   aFlag -- parser flags (not used here)
 *  @return  result of peeking at, or of consuming, the following char
 */
nsresult
CNewlineToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
{
  PRUnichar nextChar;
  nsresult rv = aScanner.Peek(nextChar);

  if (NS_OK == rv) {
    // The break is a pair when the upcoming character is the counterpart
    // of the one that was just consumed (LF+CR or CR+LF).
    const PRBool lfThenCr = (kNewLine == aChar) && (kCR == nextChar);
    const PRBool crThenLf = (kCR == aChar) && (kNewLine == nextChar);
    if (lfThenCr || crThenLf) {
      rv = aScanner.GetChar(nextChar);
    }
  }

  // Whatever its spelling, this token stands for exactly one line break.
  mNewlineCount = 1;
  return rv;
}

/*
 *  Default constructor; creates an attribute token with empty key and
 *  value, tagged as eHTMLTag_unknown.
 */
CAttributeToken::CAttributeToken()
  : CHTMLToken(eHTMLTag_unknown)
{
#ifdef DEBUG
  mLastAttribute = PR_FALSE;
#endif
  mHasEqualWithoutValue = PR_FALSE;
}

/*
 *  String based constructor.
 *
 *  @param   aName -- string copied into the attribute's text value
 */
CAttributeToken::CAttributeToken(const nsAString& aName)
  : CHTMLToken(eHTMLTag_unknown)
{
  mHasEqualWithoutValue = PR_FALSE;
#ifdef DEBUG
  mLastAttribute = PR_FALSE;
#endif
  mTextValue.Assign(aName);
}

/*
 *  Construct an attribute token from a key/value pair.
 *
 *  @param   aKey -- string the attribute key is rebound to
 *  @param   aName -- string copied into the attribute's text value
 */
CAttributeToken::CAttributeToken(const nsAString& aKey,
                                 const nsAString& aName)
  : CHTMLToken(eHTMLTag_unknown)
{
  mTextValue.Assign(aName);
  mTextKey.Rebind(aKey);
#ifdef DEBUG
  mLastAttribute = PR_FALSE;
#endif
  mHasEqualWithoutValue = PR_FALSE;
}

/*
 *  Retrieve the class-name string of attribute tokens.
 *
 *  @return  the constant string "attr"
 */
const char*
CAttributeToken::GetClassName()
{
  return "attr";
}

/*
 *  Retrieve the token-type code of attribute tokens.
 *
 *  @return  eToken_attribute
 */
PRInt32
CAttributeToken::GetTokenType()
{
  return eToken_attribute;
}

/*
 *  Removes non-alpha-non-digit characters from the end of a KEY
 *
 *  @update harishd 07/15/99
 *  @param
 *  @return
 */
void CAttributeToken::SanitizeKey() {
  PRInt32   length=mTextKey.Length();
  if(length > 0) {
    nsScannerIterator iter, begin, end;
    mTextKey.BeginReading(begin);
    mTextKey.EndReading(end);
    iter = end;

    // Look for the first legal character starting from
    // the end of the string
    do {
      --iter;
    } while (!nsCRT::IsAsciiAlpha(*iter) &&
             !nsCRT::IsAsciiDigit(*iter) &&
             (iter != begin));

    // If there were any illegal characters, just copy out the
    // legal part
    if (iter != --end) {
      nsAutoString buf;
      CopyUnicodeTo(begin, ++iter, buf);
      mTextKey.Rebind(buf);
    }
  }

  return;
}

/*
 *  Retrieve the attribute key.
 *
 *  @return  string view of mTextKey
 */
const nsAString&
CAttributeToken::GetKey()
{
  return mTextKey.AsString();
}

/*
 *  Retrieve the attribute value.
 *
 *  @return  reference to mTextValue
 */
const nsAString&
CAttributeToken::GetStringValue()
{
  return mTextValue;
}

/*
 *  Replace the contents of the given string with the source form of this
 *  attribute (see AppendSourceTo for the format).
 *
 *  @param   anOutputString will receive the result
 */
void
CAttributeToken::GetSource(nsString& anOutputString)
{
  // Start from an empty string, then reuse the appending variant.
  anOutputString.Truncate();
  AppendSourceTo(anOutputString);
}

/*
 *  Append the source form of this attribute: the key, then "=" when there
 *  is a value or when an equal sign was seen without one, then the value.
 *
 *  @param   anOutputString -- string the result is appended to
 */
void
CAttributeToken::AppendSourceTo(nsAString& anOutputString)
{
  AppendUnicodeTo(mTextKey, anOutputString);

  const PRBool needsEqualSign =
    !mTextValue.IsEmpty() || mHasEqualWithoutValue;
  if (needsEqualSign) {
    anOutputString.Append(NS_LITERAL_STRING("="));
  }

  anOutputString.Append(mTextValue);
  // anOutputString.Append(NS_LITERAL_STRING(";"));
}

static void AppendNCR(nsString& aString, PRInt32 aNCRValue);
/*
 *  Consume something that starts with '&' inside an attribute value and
 *  append its replacement to aString. When the text turns out not to be
 *  an entity (or entities must not be reduced), only the '&' itself is
 *  consumed and appended.
 *
 *  @param   aString -- string the result is appended to
 *  @param   aScanner -- controller of underlying input source
 *  @param   aFlag -- If NS_IPARSER_FLAG_VIEW_SOURCE do not reduce entities...
 *  @return  error result
 *
 */
static
nsresult ConsumeAttributeEntity(nsString& aString,
                                nsScanner& aScanner,
                                PRInt32 aFlag)
{

  nsresult result=NS_OK;

  // Look at the character that follows the '&' (which is at offset 0).
  PRUnichar ch;
  result=aScanner.Peek(ch, 1);

  if (NS_SUCCEEDED(result)) {
    PRUnichar amp=0;
    PRInt32 theNCRValue=0;
    nsAutoString entity;

    if (nsCRT::IsAsciiAlpha(ch) && !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
      // Possibly a named entity.
      result=CEntityToken::ConsumeEntity(ch,entity,aScanner);
      if (NS_SUCCEEDED(result)) {
        theNCRValue = nsHTMLEntities::EntityToUnicode(entity);
        PRUnichar theTermChar=entity.Last();
        // If an entity value is greater than 255 then:
        // Nav 4.x does not treat it as an entity,
        // IE treats it as an entity if terminated with a semicolon.
        // Resembling IE!!
        if(theNCRValue < 0 || (theNCRValue > 255 && theTermChar != ';')) {
          // Looks like we're not dealing with an entity
          aString.Append(kAmpersand);
          aString.Append(entity);
        }
        else {
          // A valid entity so reduce it.
          aString.Append(PRUnichar(theNCRValue));
        }
      }
    }
    else if (ch==kHashsign && !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
      // Possibly a numeric character reference.
      result=CEntityToken::ConsumeEntity(ch,entity,aScanner);
      if (NS_SUCCEEDED(result)) {
        if (result == NS_HTMLTOKENS_NOT_AN_ENTITY) {
          // Looked like an entity but it's not
          aScanner.GetChar(amp);
          aString.Append(amp);
          result = NS_OK; // just being safe..
        }
        else {
          // NOTE(review): the conversion error reported through err is
          // not checked before the value is appended.
          PRInt32 err;
          theNCRValue=entity.ToInteger(&err,kAutoDetect);
          AppendNCR(aString, theNCRValue);
        }
      }
    }
    else {
      // What we thought as entity is not really an entity...
      aScanner.GetChar(amp);
      aString.Append(amp);
    }//if
  }

  return result;
}

/*
 *  This general purpose method is used when you want to
 *  consume attributed text value.
 *  Note: It also reduces entities within attributes.
 *
 *  @param   aString -- string the consumed value text is appended to
 *  @param   aNewlineCount -- the newline count to increment when hitting newlines
 *  @param   aScanner -- controller of underlying input source
 *  @param   aEndCondition -- characters that stop consuming attribute.
 *  @param   aAllowNewlines -- whether to allow newlines in the value.
 *                             XXX it would be nice to roll this info into
 *                             aEndCondition somehow....
 *  @param   aFlag - contains information such as |dtd mode|view mode|doctype|etc...
 *  @return  error result
 */
static
nsresult ConsumeAttributeValueText(nsString& aString,
                                   PRInt32& aNewlineCount,
                                   nsScanner& aScanner,
                                   const nsReadEndCondition& aEndCondition,
                                   PRBool aAllowNewlines,
                                   PRInt32 aFlag)
{
  nsresult result = NS_OK;
  PRBool   done = PR_FALSE;

  do {
    result = aScanner.ReadUntil(aString,aEndCondition,PR_FALSE);
    if(NS_SUCCEEDED(result)) {
      // The result of this Peek is not checked, so start from a defined
      // value: should the Peek fail without storing anything, ch matches
      // none of the special characters below and the loop simply ends.
      PRUnichar ch = 0;
      aScanner.Peek(ch);
      if(ch == kAmpersand) {
        // Entity (or a lone '&'): reduce it and keep reading.
        result = ConsumeAttributeEntity(aString,aScanner,aFlag);
      }
      else if(ch == kCR && aAllowNewlines) {
        // CR or CRLF: copied through unchanged, counted as one newline.
        aScanner.GetChar(ch);
        result = aScanner.Peek(ch);
        if (NS_SUCCEEDED(result)) {
          if(ch == kNewLine) {
            aString.Append(NS_LITERAL_STRING("\r\n"));
            aScanner.GetChar(ch);
          }
          else {
            aString.Append(PRUnichar('\r'));
          }
          ++aNewlineCount;
        }
      }
      else if(ch == kNewLine && aAllowNewlines) {
        aScanner.GetChar(ch);
        aString.Append(PRUnichar('\n'));
        ++aNewlineCount;
      }
      else {
        // Any other terminator ends the value; it is left in the scanner.
        done = PR_TRUE;
      }
    }
  } while (NS_SUCCEEDED(result) && !done);

  return result;
}

/*
 *  This general purpose method is used when you want to
 *  consume a known quoted string.
 *
 *  @param   aChar -- the opening quote character (" or '), already consumed
 *  @param   aString -- string the value text is appended to
 *  @param   aNewlineCount -- the newline count to increment when hitting newlines
 *  @param   aScanner -- controller of underlying input source
 *  @param   aFlag - contains information such as |dtd mode|view mode|doctype|etc...
 *  @return  error result
 */
static
nsresult ConsumeQuotedString(PRUnichar aChar,
                             nsString& aString,
                             PRInt32& aNewlineCount,
                             nsScanner& aScanner,
                             PRInt32 aFlag)
{
  NS_ASSERTION(aChar==kQuote || aChar==kApostrophe,"char is neither quote nor apostrophe");

  // Inside quotes, reading stops at the matching quote, at '&' (entity)
  // and at CR/LF (so that newlines can be counted).
  static const PRUnichar theTerminalCharsQuote[] = {
    PRUnichar(kQuote), PRUnichar('&'), PRUnichar(kCR),
    PRUnichar(kNewLine), PRUnichar(0) };
  static const PRUnichar theTerminalCharsApostrophe[] = {
    PRUnichar(kApostrophe), PRUnichar('&'), PRUnichar(kCR),
    PRUnichar(kNewLine), PRUnichar(0) };
  static const nsReadEndCondition
    theTerminateConditionQuote(theTerminalCharsQuote);
  static const nsReadEndCondition
    theTerminateConditionApostrophe(theTerminalCharsApostrophe);

  // Assume Quote to init to something
  const nsReadEndCondition *terminateCondition = &theTerminateConditionQuote;
  if (aChar==kApostrophe)
    terminateCondition = &theTerminateConditionApostrophe;

  nsresult result=NS_OK;
  // Remember where the value starts in case we have to re-read it below.
  nsScannerIterator theOffset;
  aScanner.CurrentPosition(theOffset);

  result=ConsumeAttributeValueText(aString,aNewlineCount,aScanner,
                                   *terminateCondition,PR_TRUE,aFlag);

  if(NS_SUCCEEDED(result)) {
    result = aScanner.SkipOver(aChar); // aChar should be " or '
  }

  // Ref: Bug 35806
  // A back up measure when disaster strikes...
  // Ex <table> <tr d="><td>hello</td></tr></table>
  // At the end of a complete (non-incremental) document with no closing
  // quote found, start over and read the value as if it were unquoted.
  if(!aString.IsEmpty() && aString.Last()!=aChar &&
     !aScanner.IsIncremental() && result==kEOF) {
    static const nsReadEndCondition
      theAttributeTerminator(kAttributeTerminalChars);
    aString.Truncate();
    aScanner.SetPosition(theOffset, PR_FALSE, PR_TRUE);
    result=ConsumeAttributeValueText(aString,aNewlineCount,aScanner,
                                     theAttributeTerminator,PR_FALSE,aFlag);
  }
  return result;
}

/*
 *  Consume the key and value portions of the attribute.
 *
 *  The key is bound into mTextKey and the (optional) value is collected
 *  in mTextValue. In view-source mode the whitespace around the key and
 *  the value, as well as the quotes around a quoted value, are kept.
 *
 *  @update  rickg 03.23.2000
 *  @param   aChar -- last char consumed from stream
 *  @param   aScanner -- controller of underlying input source
 *  @param   aFlag - contains information such as |dtd mode|view mode|doctype|etc...
 *  @return  error result
 */
nsresult CAttributeToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 aFlag) {

  nsresult result;

  //In view-source mode the ws/cr/lf sequences around the key are retained
  //(they become part of the key) so that viewsource looks good; otherwise
  //they are skipped.

  nsScannerIterator wsstart, wsend;

  if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
    result = aScanner.ReadWhitespace(wsstart, wsend, mNewlineCount);
  }
  else {
    result = aScanner.SkipWhitespace(mNewlineCount);
  }

  if (NS_OK==result) {
    //Characters that end an attribute key.
    static const PRUnichar theTerminalsChars[] =
    { PRUnichar(' '), PRUnichar('"'),
      PRUnichar('='), PRUnichar('\n'),
      PRUnichar('\r'), PRUnichar('\t'),
      PRUnichar('>'), PRUnichar('<'),
      PRUnichar('\b'), PRUnichar(0) };
    static const nsReadEndCondition theEndCondition(theTerminalsChars);

    nsScannerIterator start, end;
    result=aScanner.ReadUntil(start,end,theEndCondition,PR_FALSE);

    if (!(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
      aScanner.BindSubstring(mTextKey, start, end);
    }

    //now it's time to Consume the (optional) value...
    if (NS_OK==result) {
      if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
        //Bind the key from the start of the leading whitespace to the end
        //of the trailing whitespace.
        result = aScanner.ReadWhitespace(start, wsend, mNewlineCount);
        aScanner.BindSubstring(mTextKey, wsstart, wsend);
      }
      else {
        result = aScanner.SkipWhitespace(mNewlineCount);
      }

      if (NS_OK==result) {
        result=aScanner.Peek(aChar);       //Skip ahead until you find an equal sign or a '>'...
        if (NS_OK==result) {
          if (kEqual==aChar){
            result=aScanner.GetChar(aChar);  //skip the equal sign...
            if (NS_OK==result) {
              if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
                result = aScanner.ReadWhitespace(mTextValue, mNewlineCount);
              }
              else {
                result = aScanner.SkipWhitespace(mNewlineCount);
              }

              if (NS_OK==result) {
                result=aScanner.Peek(aChar);  //and grab the next char.
                if (NS_OK==result) {
                  if ((kQuote==aChar) || (kApostrophe==aChar)) {
                    //Quoted value: eat the opening quote, then the string.
                    aScanner.GetChar(aChar);
                    result=ConsumeQuotedString(aChar,mTextValue,mNewlineCount,
                                               aScanner,aFlag);
                    if (NS_SUCCEEDED(result) && (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
                      //View-source shows the quotes as well.
                      mTextValue.Insert(aChar,0);
                      mTextValue.Append(aChar);
                    }
                    // According to spec. we ( who? ) should ignore linefeeds. But look,
                    // even the carriage return was getting stripped ( wonder why! ) -
                    // Ref. to bug 15204.  Okay, so the spec. told us to ignore linefeeds,
                    // but then what about bug 47535 ? Should we preserve everything then?
                    // Well, let's make it so! Commenting out the next two lines..
                    /*if(!aRetain)
                      mTextValue.StripChars("\r\n"); //per the HTML spec, ignore linefeeds...
                    */
                  }
                  else if (kGreaterThan==aChar){
                    //"key=" directly followed by '>': remember the '=' so
                    //that the source can be reproduced.
                    mHasEqualWithoutValue=PR_TRUE;
                  }
                  else {
                    //Unquoted value.
                    static const nsReadEndCondition
                      theAttributeTerminator(kAttributeTerminalChars);
                    result=ConsumeAttributeValueText(mTextValue,
                                                     mNewlineCount,
                                                     aScanner,
                                                     theAttributeTerminator,
                                                     PR_FALSE,
                                                     aFlag);
                  }
                }//if
                if (NS_OK==result) {
                  if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
                    result = aScanner.ReadWhitespace(mTextValue, mNewlineCount);
                  }
                  else {
                    result = aScanner.SkipWhitespace(mNewlineCount);
                  }
                }
              }//if
            }//if
          }//if
          else {
            //This is where we have to handle fairly busted content.
            //If you're here, it means we saw an attribute name, but couldn't find
            //the following equal sign.  <tag NAME=....

            //Doing this right in all cases is <i>REALLY</i> ugly.
            //My best guess is to grab the next non-ws char. We know it's not '=',
            //so let's see what it is. If it's a '"', then assume we're reading
            //from the middle of the value. Try stripping the quote and continuing...
            if (kQuote==aChar){
              result=aScanner.SkipOver(aChar); //strip quote.
            }
          }
        }//if
      } //if
    }//if (consume optional value)

    if (NS_OK==result) {
      //Peek at what follows the attribute; the result of this peek is
      //what gets returned to the caller.
      result=aScanner.Peek(aChar);
#ifdef DEBUG
      mLastAttribute = (kGreaterThan == aChar || kEOF == result);
#endif
    }
  }//if
  return result;
}

/*
 *  Replace the attribute key.
 *
 *  @param   aKey -- string the key is rebound to
 */
void
CAttributeToken::SetKey(const nsAString& aKey)
{
  mTextKey.Rebind(aKey);
}

/*
 *  Bind the attribute key to a range of the scanner's input.
 *
 *  @param   aScanner -- scanner that performs the binding
 *  @param   aStart -- start of the range
 *  @param   aEnd -- end of the range
 */
void
CAttributeToken::BindKey(nsScanner* aScanner,
                         nsScannerIterator& aStart,
                         nsScannerIterator& aEnd)
{
  aScanner->BindSubstring(mTextKey, aStart, aEnd);
}

/*
 *  Default constructor; tags the token as eHTMLTag_whitespace.
 */
CWhitespaceToken::CWhitespaceToken()
  : CHTMLToken(eHTMLTag_whitespace)
{
}


/*
 *  String based constructor.
 *
 *  @param   aName -- string copied into the token's text value
 */
CWhitespaceToken::CWhitespaceToken(const nsAString& aName)
  : CHTMLToken(eHTMLTag_whitespace)
{
  mTextValue.Assign(aName);
}

/*
 *  Retrieve the class-name string of whitespace tokens.
 *
 *  @return  the constant string "ws"
 */
const char*
CWhitespaceToken::GetClassName()
{
  return "ws";
}

/*
 *  Retrieve the token-type code of whitespace tokens.
 *
 *  @return  eToken_whitespace
 */
PRInt32
CWhitespaceToken::GetTokenType()
{
  return eToken_whitespace;
}

/*
 *  This general purpose method is used when you want to
 *  consume an arbitrary sequence of whitespace. The text starts with
 *  aChar; on success all carriage returns are stripped from it.
 *
 *  @param   aChar -- last char consumed from stream
 *  @param   aScanner -- controller of underlying input source
 *  @param   aFlag -- parser flags (not used here)
 *  @return  error result
 */
nsresult
CWhitespaceToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
{
  mTextValue.Assign(aChar);

  const nsresult rv = aScanner.ReadWhitespace(mTextValue, mNewlineCount);
  if (NS_OK != rv) {
    return rv;
  }

  mTextValue.StripChar(kCR);
  return rv;
}

/*
 *  Retrieve the whitespace text of this token.
 *
 *  @return  reference to mTextValue
 */
const nsAString&
CWhitespaceToken::GetStringValue()
{
  return mTextValue;
}

/*
 *  Default constructor; tags the token as eHTMLTag_entity.
 */
CEntityToken::CEntityToken()
  : CHTMLToken(eHTMLTag_entity)
{
}

/*
 *  String based constructor. Builds with VERBOSE_DEBUG defined also
 *  verify the entity table and report when the check fails.
 *
 *  @param   aName -- string copied into the token's text value
 */
CEntityToken::CEntityToken(const nsAString& aName)
  : CHTMLToken(eHTMLTag_entity)
{
  mTextValue.Assign(aName);
#ifdef VERBOSE_DEBUG
  if (!VerifyEntityTable()) {
    cout<<"Entity table is invalid!" << endl;
  }
#endif
}


/*
 *  Consume an entity into this token's text value by delegating to
 *  ConsumeEntity().
 *
 *  NOTE(review): the inline comments in ConsumeEntity() indicate that the
 *  '&' is consumed there rather than by the caller -- confirm.
 *
 *  @param   aChar -- char handed on to ConsumeEntity()
 *  @param   aScanner -- controller of underlying input source
 *  @param   aFlag -- parser flags (not used here)
 *  @return  result of ConsumeEntity()
 */
nsresult
CEntityToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
{
  return ConsumeEntity(aChar, mTextValue, aScanner);
}

/*
 *  Retrieve the class-name string of entity tokens.
 *
 *  @return  the constant string "&entity"
 */
const char*
CEntityToken::GetClassName()
{
  return "&entity";
}


/*
 *  Retrieve the token-type code of entity tokens.
 *
 *  @return  eToken_entity
 */
PRInt32
CEntityToken::GetTokenType()
{
  return eToken_entity;
}

/*
 *  This general purpose method is used when you want to
 *  consume an entity &xxxx;. Keep in mind that entities
 *  are <i>not</i> reduced inline.
 *
 *  Three shapes are handled, selected by aChar:
 *   - '{'   : script entity, read up to the matching '}'
 *   - '#'   : numeric reference, decimal or hex ("#x" / "#X")
 *   - other : named entity starting with a letter, '_' or ':'
 *  In each case the '&' is consumed here; a terminating ';' is consumed
 *  and appended when present.
 *
 *  @update  gess 3/25/98
 *  @param   aChar -- the char that decides the shape of the entity (see above)
 *  @param   aString -- receives the entity text, without the '&'
 *  @param   aScanner -- controller of underlying input source
 *  @return  error result; NS_HTMLTOKENS_NOT_AN_ENTITY when the input is
 *           not an entity, in which case nothing has been consumed
 */
nsresult
CEntityToken::ConsumeEntity(PRUnichar aChar,
                            nsString& aString,
                            nsScanner& aScanner) {
  nsresult result=NS_OK;
  if(kLeftBrace==aChar) {
    //you're consuming a script entity...
    aScanner.GetChar(aChar); // Consume &

    PRInt32 rightBraceCount = 0;
    PRInt32 leftBraceCount  = 0;

    // Copy characters until every '{' has been matched by a '}'.
    do {
      result=aScanner.GetChar(aChar);

      if (NS_FAILED(result)) {
        return result;
      }

      aString.Append(aChar);
      if(aChar==kRightBrace)
        ++rightBraceCount;
      else if(aChar==kLeftBrace)
        ++leftBraceCount;
    } while(leftBraceCount!=rightBraceCount);
  } //if
  else {
    PRUnichar theChar=0;
    if (kHashsign==aChar) {
      // Look at the character after "&#" to tell decimal from hex.
      result = aScanner.Peek(theChar,2);

      if (NS_FAILED(result)) {
        if (kEOF == result && !aScanner.IsIncremental()) {
          // If this is the last buffer then we are certainly
          // not dealing with an entity. That is, there are
          // no more characters after &#. Bug 188278.
          return NS_HTMLTOKENS_NOT_AN_ENTITY;
        }
        return result;
      }

      if (nsCRT::IsAsciiDigit(theChar)) {
        aScanner.GetChar(aChar); // Consume &
        aScanner.GetChar(aChar); // Consume #
        aString.Assign(aChar);
        result=aScanner.ReadNumber(aString,10);
      }
      else if (theChar == 'x' || theChar == 'X') {
        aScanner.GetChar(aChar);   // Consume &
        aScanner.GetChar(aChar);   // Consume #
        aScanner.GetChar(theChar); // Consume x
        aString.Assign(aChar);
        aString.Append(theChar);
        result=aScanner.ReadNumber(aString,16);
      }
      else {
        return NS_HTMLTOKENS_NOT_AN_ENTITY;
      }
    }
    else {
      // Look at the character right after the '&'.
      result = aScanner.Peek(theChar,1);

      if (NS_FAILED(result)) {
        return result;
      }

      if(nsCRT::IsAsciiAlpha(theChar) ||
        theChar == '_' ||
        theChar == ':') {
        aScanner.GetChar(aChar); // Consume &
        result=aScanner.ReadIdentifier(aString,PR_TRUE); // Ref. Bug# 23791 - For setting aIgnore to PR_TRUE.
      }
      else {
        return NS_HTMLTOKENS_NOT_AN_ENTITY;
      }
    }
  }

  if (NS_FAILED(result)) {
    return result;
  }

  // See whether the entity is terminated by a semicolon.
  result=aScanner.Peek(aChar);

  if (NS_FAILED(result)) {
    return result;
  }

  if (aChar == kSemicolon) {
    // consume semicolon that stopped the scan
    aString.Append(aChar);
    result=aScanner.GetChar(aChar);
  }

  return result;
}

#define PA_REMAP_128_TO_160_ILLEGAL_NCR 1

#ifdef PA_REMAP_128_TO_160_ILLEGAL_NCR
/**
 * Map some illegal but commonly used numeric entities into their
 * appropriate unicode value.
 *
 * Entry i holds the replacement for the numeric character reference
 * 0x80 + i; the table covers 0x80 through 0x9f.
 * NOTE(review): the values look like the Windows-1252 assignments for
 * that range -- confirm.
 */
#define NOT_USED 0xfffd

static const PRUint16 PA_HackTable[] = {
  0x20ac,    /* 0x80: EURO SIGN */
  NOT_USED,  /* 0x81 */
  0x201a,    /* 0x82: SINGLE LOW-9 QUOTATION MARK */
  0x0192,    /* 0x83: LATIN SMALL LETTER F WITH HOOK */
  0x201e,    /* 0x84: DOUBLE LOW-9 QUOTATION MARK */
  0x2026,    /* 0x85: HORIZONTAL ELLIPSIS */
  0x2020,    /* 0x86: DAGGER */
  0x2021,    /* 0x87: DOUBLE DAGGER */
  0x02c6,    /* 0x88: MODIFIER LETTER CIRCUMFLEX ACCENT */
  0x2030,    /* 0x89: PER MILLE SIGN */
  0x0160,    /* 0x8a: LATIN CAPITAL LETTER S WITH CARON */
  0x2039,    /* 0x8b: SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
  0x0152,    /* 0x8c: LATIN CAPITAL LIGATURE OE */
  NOT_USED,  /* 0x8d */
  0x017D,    /* 0x8e: LATIN CAPITAL LETTER Z WITH CARON */
  NOT_USED,  /* 0x8f */
  NOT_USED,  /* 0x90 */
  0x2018,    /* 0x91: LEFT SINGLE QUOTATION MARK */
  0x2019,    /* 0x92: RIGHT SINGLE QUOTATION MARK */
  0x201c,    /* 0x93: LEFT DOUBLE QUOTATION MARK */
  0x201d,    /* 0x94: RIGHT DOUBLE QUOTATION MARK */
  0x2022,    /* 0x95: BULLET */
  0x2013,    /* 0x96: EN DASH */
  0x2014,    /* 0x97: EM DASH */
  0x02dc,    /* 0x98: SMALL TILDE */
  0x2122,    /* 0x99: TRADE MARK SIGN */
  0x0161,    /* 0x9a: LATIN SMALL LETTER S WITH CARON */
  0x203a,    /* 0x9b: SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
  0x0153,    /* 0x9c: LATIN SMALL LIGATURE OE */
  NOT_USED,  /* 0x9d */
  0x017E,    /* 0x9e: LATIN SMALL LETTER Z WITH CARON */
  0x0178     /* 0x9f: LATIN CAPITAL LETTER Y WITH DIAERESIS */
};
#endif /* PA_REMAP_128_TO_160_ILLEGAL_NCR */

/*
 *  Append the character denoted by a numeric character reference.
 *
 *  Values 0x80..0x9f are first remapped through PA_HackTable (when that
 *  remapping is compiled in). Values that are not Unicode code points at
 *  all (negative, or above 0x10FFFF) are replaced by U+FFFD, because they
 *  cannot be encoded and would otherwise yield a bogus surrogate pair.
 *  Code points outside the BMP are appended as a surrogate pair.
 *
 *  @param   aString -- string the character is appended to
 *  @param   aNCRValue -- numeric value of the reference
 */
static void AppendNCR(nsString& aString, PRInt32 aNCRValue)
{
#ifdef PA_REMAP_128_TO_160_ILLEGAL_NCR
  /* for some illegal, but popular usage */
  if ((aNCRValue >= 0x0080) && (aNCRValue <= 0x009f)) {
    aNCRValue = PA_HackTable[aNCRValue - 0x0080];
  }
#endif

  // The Unicode code space ends at 0x10FFFF; anything outside of it has
  // no UTF-16 representation.
  if (aNCRValue < 0 || aNCRValue > 0x10FFFF) {
    aNCRValue = 0xFFFD; // REPLACEMENT CHARACTER
  }

  if (IS_IN_BMP(aNCRValue))
    aString.Append(PRUnichar(aNCRValue));
  else {
    aString.Append(PRUnichar(H_SURROGATE(aNCRValue)));
    aString.Append(PRUnichar(L_SURROGATE(aNCRValue)));
  }
}

/*
 *  This method converts this entity into its underlying
 *  unicode equivalent.
 *
 *  A numeric entity (text starting with '#') is appended to aString when
 *  its number parses without error; a known named entity replaces the
 *  contents of aString. Entity text of length 0 or 1 is not translated.
 *
 *  @param   aString will hold the resulting string value
 *  @return  numeric (unichar) value; 0 when nothing was translated because
 *           the text is too short, otherwise whatever the number parser or
 *           the entity lookup returned
 */
PRInt32
CEntityToken::TranslateToUnicodeStr(nsString& aString)
{
  if (mTextValue.Length() <= 1) {
    return 0;
  }

  PRInt32 codePoint;
  if (kHashsign == mTextValue.CharAt(0)) {
    // Numeric character reference.
    PRInt32 parseError = 0;
    codePoint = mTextValue.ToInteger(&parseError, kAutoDetect);
    if (0 == parseError) {
      AppendNCR(aString, codePoint);
    }
  }
  else {
    // Named entity; the lookup yields a negative value when unknown.
    codePoint = nsHTMLEntities::EntityToUnicode(mTextValue);
    if (codePoint > -1) {
      aString.Assign(PRUnichar(codePoint));
    }
  }

  return codePoint;
}


/*
 *  Retrieve the entity text of this token.
 *
 *  @return  reference to mTextValue
 */
const nsAString&
CEntityToken::GetStringValue()
{
  return mTextValue;
}

/*
 *  Append the source form of this entity ("&" followed by the entity
 *  text) to the given string. The string is not cleared first.
 *
 *  @param   anOutputString will receive the result
 */
void
CEntityToken::GetSource(nsString& anOutputString)
{
  anOutputString.Append(NS_LITERAL_STRING("&"));
  anOutputString.Append(mTextValue);
  //anOutputString+=";";
}

/*
 *  Append the source form of this entity ("&" followed by the entity
 *  text) to the given string.
 *
 *  @param   anOutputString -- string the result is appended to
 */
void
CEntityToken::AppendSourceTo(nsAString& anOutputString)
{
  anOutputString.Append(NS_LITERAL_STRING("&"));
  anOutputString.Append(mTextValue);
  //anOutputString+=";";
}

/*
 *  Default constructor; tags the token as eHTMLTag_script.
 */
CScriptToken::CScriptToken()
  : CHTMLToken(eHTMLTag_script)
{
}

/*
 *  String based constructor.
 *
 *  @param   aString -- string copied into the token's text value
 */
CScriptToken::CScriptToken(const nsAString& aString)
  : CHTMLToken(eHTMLTag_script)
{
  mTextValue.Assign(aString);
}


/*
 *  Retrieve the class-name string of script tokens.
 *
 *  @return  the constant string "script"
 */
const char*
CScriptToken::GetClassName()
{
  return "script";
}

/*
 *  Retrieve the token-type code of script tokens.
 *
 *  @return  eToken_script
 */
PRInt32
CScriptToken::GetTokenType()
{
  return eToken_script;
}

/*
 *  Retrieve the script text of this token.
 *
 *  @return  reference to mTextValue
 */
const nsAString&
CScriptToken::GetStringValue()
{
  return mTextValue;
}

/*
 *  Default constructor; tags the token as eHTMLTag_style.
 */
CStyleToken::CStyleToken()
  : CHTMLToken(eHTMLTag_style)
{
}

/*
 *  String based constructor.
 *
 *  @param   aString -- string copied into the token's text value
 */
CStyleToken::CStyleToken(const nsAString& aString)
  : CHTMLToken(eHTMLTag_style)
{
  mTextValue.Assign(aString);
}

/*
 *  Retrieve the class-name string of style tokens.
 *
 *  @return  the constant string "style"
 */
const char*
CStyleToken::GetClassName()
{
  return "style";
}

/*
 *  Retrieve the token-type code of style tokens.
 *
 *  @return  eToken_style
 */
PRInt32
CStyleToken::GetTokenType()
{
  return eToken_style;
}

/*
 *  Retrieve the style text of this token.
 *
 *  @return  reference to mTextValue
 */
const nsAString&
CStyleToken::GetStringValue()
{
  return mTextValue;
}


/**
 * Look up the name of a tag.
 *
 * @param  aTag -- tag id, interpreted as an nsHTMLTag
 * @return the string nsHTMLTags has for the tag; failing that,
 *         sUserdefined for ids at or above eHTMLTag_userdefined;
 *         otherwise null
 */
const PRUnichar*
GetTagName(PRInt32 aTag)
{
  const PRUnichar* tagName = nsHTMLTags::GetStringValue((nsHTMLTag) aTag);

  // Fall back to the generic user-defined name when the tag table has no
  // entry for this id.
  if (!tagName && aTag >= eHTMLTag_userdefined) {
    tagName = sUserdefined;
  }

  return tagName;
}


/**
 *  Default constructor. Creates an instruction token whose tag id is
 *  eHTMLTag_instruction.
 */
CInstructionToken::CInstructionToken()
  : CHTMLToken(eHTMLTag_instruction)
{
}

/**
 *  Constructs an instruction token from a string, copying the string
 *  into the token's text value.
 *
 *  NOTE(review): this overload passes eHTMLTag_unknown to the base
 *  class, whereas the default constructor passes eHTMLTag_instruction.
 *  Confirm whether the difference is intentional before changing it.
 *
 *  @param   aString -- text to initialize the token's value with
 */
CInstructionToken::CInstructionToken(const nsAString& aString)
  : CHTMLToken(eHTMLTag_unknown)
{
  mTextValue.Assign(aString);
}

/**
 *  Consumes a processing instruction from the scanner.
 *
 *  The token text is reset to "<?" and the scanner then reads into it
 *  up to kGreaterThan. PR_TRUE is passed as the last ReadUntil argument
 *  (presumably so the terminating character is included -- confirm
 *  against nsScanner::ReadUntil).
 *
 *  @param   aChar    -- not used by this method
 *  @param   aScanner -- scanner to read the instruction text from
 *  @param   aFlag    -- not used by this method
 *  @return  the result code returned by nsScanner::ReadUntil
 */
nsresult CInstructionToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
{
  mTextValue.Assign(NS_LITERAL_STRING("<?"));
  return aScanner.ReadUntil(mTextValue, kGreaterThan, PR_TRUE);
}

/**
 *  Returns the class name of this token type.
 *
 *  @return  the constant string "instruction"
 */
const char* CInstructionToken::GetClassName()
{
  return "instruction";
}

/**
 *  Returns the token-type identifier for instruction tokens.
 *
 *  @return  eToken_instruction
 */
PRInt32 CInstructionToken::GetTokenType()
{
  return eToken_instruction;
}

/**
 *  Gives read-only access to the text held by this instruction token.
 *
 *  @return  reference to the token's mTextValue member
 */
const nsAString& CInstructionToken::GetStringValue()
{
  return mTextValue;
}


/**
 *  Constructs an error token (tag id eHTMLTag_unknown) that stores the
 *  given error pointer. The destructor deletes the stored pointer, so
 *  the token takes ownership of aError.
 *
 *  @param   aError -- error description to attach to the token
 */
CErrorToken::CErrorToken(nsParserError *aError)
  : CHTMLToken(eHTMLTag_unknown),
    mError(aError)
{
}

/**
 *  Destructor. Deletes the error object held in mError.
 */
CErrorToken::~CErrorToken()
{
  // The token owns its error description and releases it here.
  delete mError;
}

/**
 *  Returns the token-type identifier for error tokens.
 *
 *  @return  eToken_error
 */
PRInt32 CErrorToken::GetTokenType()
{
  return eToken_error;
}

/**
 *  Returns the class name of this token type.
 *
 *  @return  the constant string "error"
 */
const char* CErrorToken::GetClassName()
{
  return "error";
}

/**
 *  Replaces the error object attached to this token.
 *
 *  The destructor deletes mError, so the token owns its error object.
 *  Any previously stored error is therefore deleted here before the
 *  new pointer is stored; otherwise it would be leaked.
 *
 *  @param   aError -- new error description; the token takes ownership.
 *                     May be null.
 */
void CErrorToken::SetError(nsParserError *aError)
{
  // Guard against self-assignment so the object being stored is never
  // the one that gets deleted.
  if (mError != aError) {
    delete mError;
  }
  mError = aError;
}

/**
 *  Gives read-only access to the error object attached to this token.
 *
 *  @return  the stored mError pointer
 */
const nsParserError* CErrorToken::GetError()
{
  return mError;
}

/**
 *  Gives read-only access to the text held by this error token.
 *
 *  @return  reference to the token's mTextValue member
 */
const nsAString& CErrorToken::GetStringValue()
{
  return mTextValue;
}

// Doctype decl token

/**
 *  Constructs a doctype declaration token with the given tag id.
 *
 *  @param   aTag -- tag id forwarded to the CHTMLToken base class
 */
CDoctypeDeclToken::CDoctypeDeclToken(eHTMLTags aTag)
  : CHTMLToken(aTag)
{
}

/**
 *  Constructs a doctype declaration token with the given tag id and
 *  initial text.
 *
 *  @param   aString -- text used to initialize the token's value
 *  @param   aTag    -- tag id forwarded to the CHTMLToken base class
 */
CDoctypeDeclToken::CDoctypeDeclToken(const nsAString& aString, eHTMLTags aTag)
  : CHTMLToken(aTag),
    mTextValue(aString)
{
}

/**
 *  Consumes a doctype declaration from the scanner.
 *
 *  The scan runs from the current scanner position to the first '>' or
 *  '<' rather than trying to track quoted strings (per the original
 *  author's note, quotes proved unreliable here). A terminating '>' is
 *  consumed and included in the token text; a '<' is left in the
 *  scanner because it may start another tag. If neither terminator is
 *  found and the scanner is not incremental, whatever was read is used
 *  and the call succeeds.
 *
 *  On success the token text is copied into mTextValue, beginning two
 *  characters before the starting position so the leading "<!" is
 *  included.
 *
 *  @param   aChar    -- not used by this method
 *  @param   aScanner -- scanner positioned just past the leading "<!"
 *  @param   aFlag    -- not used by this method
 *  @return  NS_OK on success, otherwise the failure code from
 *           nsScanner::ReadUntil
 */
nsresult CDoctypeDeclToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 aFlag) {

  static const PRUnichar terminalChars[] =
  { PRUnichar('>'), PRUnichar('<'),
    PRUnichar(0)
  };
  static const nsReadEndCondition theEndCondition(terminalChars);

  nsScannerIterator start, end;

  aScanner.CurrentPosition(start);
  aScanner.EndReading(end);

  nsresult result=aScanner.ReadUntil(start, end, theEndCondition, PR_FALSE);

  if (NS_SUCCEEDED(result)) {
    // Initialize so that a failed Peek() cannot leave an indeterminate
    // value to be compared below.
    PRUnichar ch = 0;
    aScanner.Peek(ch);
    if (ch == kGreaterThan) {
      // Include '>' but not '<' since '<'
      // could belong to another tag.
      aScanner.GetChar(ch);
      end.advance(1);
    }
  }
  else if (!aScanner.IsIncremental()) {
    // We have reached the document end but haven't
    // found either a '<' or a '>'. Therefore use
    // whatever we have.
    result = NS_OK;
  }

  if (NS_SUCCEEDED(result)) {
    start.advance(-2); // Make sure to consume <!
    CopyUnicodeTo(start,end,mTextValue);
  }

  return result;
}

/**
 *  Returns the class name of this token type.
 *
 *  @return  the constant string "doctype"
 */
const char* CDoctypeDeclToken::GetClassName()
{
  return "doctype";
}

/**
 *  Returns the token-type identifier for doctype declaration tokens.
 *
 *  @return  eToken_doctypeDecl
 */
PRInt32 CDoctypeDeclToken::GetTokenType()
{
  return eToken_doctypeDecl;
}

/**
 *  Gives read-only access to the text held by this doctype token.
 *
 *  @return  reference to the token's mTextValue member
 */
const nsAString& CDoctypeDeclToken::GetStringValue()
{
  return mTextValue;
}

/**
 *  Replaces the text held by this doctype token with a copy of the
 *  given string.
 *
 *  @param   aStr -- new text for the token
 */
void CDoctypeDeclToken::SetStringValue(const nsAString& aStr)
{
  mTextValue.Assign(aStr);
}