Bug 509009, bug 497848 - Update the HTML5 tokenizer to spec (excluding script changes) as of 2009-11-24. rs=sicking.

This commit is contained in:
Henri Sivonen
2009-11-30 17:34:51 +02:00
parent c43e74a76d
commit 48e31b0668
7 changed files with 1488 additions and 1425 deletions

View File

@@ -443,8 +443,9 @@ nsHtml5Tokenizer::tokenizeBuffer(nsHtml5UTF16Buffer* buffer)
switch(state) {
case NS_HTML5TOKENIZER_DATA:
case NS_HTML5TOKENIZER_RCDATA:
case NS_HTML5TOKENIZER_CDATA:
case NS_HTML5TOKENIZER_SCRIPT_DATA:
case NS_HTML5TOKENIZER_PLAINTEXT:
case NS_HTML5TOKENIZER_RAWTEXT:
case NS_HTML5TOKENIZER_CDATA_SECTION:
case NS_HTML5TOKENIZER_ESCAPE:
case NS_HTML5TOKENIZER_ESCAPE_EXCLAMATION:
@@ -538,7 +539,7 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
goto stateloop;
}
case '/': {
state = NS_HTML5TOKENIZER_CLOSE_TAG_OPEN_PCDATA;
state = NS_HTML5TOKENIZER_CLOSE_TAG_OPEN;
goto stateloop;
}
case '\?': {
@@ -779,6 +780,7 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
}
case '<':
case '=':
case '`':
default: {
clearLongStrBufAndAppendCurrentC(c);
state = NS_HTML5TOKENIZER_ATTRIBUTE_VALUE_UNQUOTED;
@@ -943,6 +945,7 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
case '\"':
case '\'':
case '=':
case '`':
default: {
appendLongStrBuf(c);
@@ -1636,14 +1639,14 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
index++;
continue;
} else {
state = NS_HTML5TOKENIZER_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER;
state = NS_HTML5TOKENIZER_AFTER_DOCTYPE_PUBLIC_KEYWORD;
reconsume = PR_TRUE;
goto doctypeublicloop_end;
}
}
doctypeublicloop_end: ;
}
case NS_HTML5TOKENIZER_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: {
case NS_HTML5TOKENIZER_AFTER_DOCTYPE_PUBLIC_KEYWORD: {
for (; ; ) {
if (reconsume) {
reconsume = PR_FALSE;
@@ -1653,6 +1656,55 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
}
c = checkChar(buf, pos);
}
switch(c) {
case '\r': {
silentCarriageReturn();
state = NS_HTML5TOKENIZER_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER;
goto stateloop_end;
}
case '\n': {
silentLineFeed();
}
case ' ':
case '\t':
case '\f': {
state = NS_HTML5TOKENIZER_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER;
goto afterdoctypepublickeywordloop_end;
}
case '\"': {
clearLongStrBufForNextState();
state = NS_HTML5TOKENIZER_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED;
goto stateloop;
}
case '\'': {
clearLongStrBufForNextState();
state = NS_HTML5TOKENIZER_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED;
goto stateloop;
}
case '>': {
forceQuirks = PR_TRUE;
emitDoctypeToken(pos);
state = NS_HTML5TOKENIZER_DATA;
goto stateloop;
}
default: {
bogusDoctype();
state = NS_HTML5TOKENIZER_BOGUS_DOCTYPE;
goto stateloop;
}
}
}
afterdoctypepublickeywordloop_end: ;
}
case NS_HTML5TOKENIZER_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: {
for (; ; ) {
if (++pos == endPos) {
goto stateloop_end;
}
c = checkChar(buf, pos);
switch(c) {
case '\r': {
silentCarriageReturn();
@@ -1740,6 +1792,7 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
switch(c) {
case '\r': {
silentCarriageReturn();
state = NS_HTML5TOKENIZER_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS;
goto stateloop_end;
}
case '\n': {
@@ -1748,23 +1801,26 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
case ' ':
case '\t':
case '\f': {
continue;
}
case '\"': {
clearLongStrBufForNextState();
state = NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
state = NS_HTML5TOKENIZER_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS;
goto afterdoctypepublicidentifierloop_end;
}
case '\'': {
clearLongStrBufForNextState();
state = NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED;
goto stateloop;
}
case '>': {
emitDoctypeToken(pos);
state = NS_HTML5TOKENIZER_DATA;
goto stateloop;
}
case '\"': {
clearLongStrBufForNextState();
state = NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
goto stateloop;
}
case '\'': {
clearLongStrBufForNextState();
state = NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED;
goto stateloop;
}
default: {
bogusDoctype();
state = NS_HTML5TOKENIZER_BOGUS_DOCTYPE;
@@ -1774,6 +1830,49 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
}
afterdoctypepublicidentifierloop_end: ;
}
case NS_HTML5TOKENIZER_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: {
for (; ; ) {
if (++pos == endPos) {
goto stateloop_end;
}
c = checkChar(buf, pos);
switch(c) {
case '\r': {
silentCarriageReturn();
goto stateloop_end;
}
case '\n': {
silentLineFeed();
}
case ' ':
case '\t':
case '\f': {
continue;
}
case '>': {
emitDoctypeToken(pos);
state = NS_HTML5TOKENIZER_DATA;
goto stateloop;
}
case '\"': {
clearLongStrBufForNextState();
state = NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
goto betweendoctypepublicandsystemidentifiersloop_end;
}
case '\'': {
clearLongStrBufForNextState();
state = NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED;
goto stateloop;
}
default: {
bogusDoctype();
state = NS_HTML5TOKENIZER_BOGUS_DOCTYPE;
goto stateloop;
}
}
}
betweendoctypepublicandsystemidentifiersloop_end: ;
}
case NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: {
for (; ; ) {
if (++pos == endPos) {
@@ -1895,14 +1994,14 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
index++;
goto stateloop;
} else {
state = NS_HTML5TOKENIZER_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER;
state = NS_HTML5TOKENIZER_AFTER_DOCTYPE_SYSTEM_KEYWORD;
reconsume = PR_TRUE;
goto doctypeystemloop_end;
}
}
doctypeystemloop_end: ;
}
case NS_HTML5TOKENIZER_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: {
case NS_HTML5TOKENIZER_AFTER_DOCTYPE_SYSTEM_KEYWORD: {
for (; ; ) {
if (reconsume) {
reconsume = PR_FALSE;
@@ -1912,6 +2011,55 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
}
c = checkChar(buf, pos);
}
switch(c) {
case '\r': {
silentCarriageReturn();
state = NS_HTML5TOKENIZER_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER;
goto stateloop_end;
}
case '\n': {
silentLineFeed();
}
case ' ':
case '\t':
case '\f': {
state = NS_HTML5TOKENIZER_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER;
goto afterdoctypesystemkeywordloop_end;
}
case '\"': {
clearLongStrBufForNextState();
state = NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
goto stateloop;
}
case '\'': {
clearLongStrBufForNextState();
state = NS_HTML5TOKENIZER_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED;
goto stateloop;
}
case '>': {
forceQuirks = PR_TRUE;
emitDoctypeToken(pos);
state = NS_HTML5TOKENIZER_DATA;
goto stateloop;
}
default: {
bogusDoctype();
state = NS_HTML5TOKENIZER_BOGUS_DOCTYPE;
goto stateloop;
}
}
}
afterdoctypesystemkeywordloop_end: ;
}
case NS_HTML5TOKENIZER_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: {
for (; ; ) {
if (++pos == endPos) {
goto stateloop_end;
}
c = checkChar(buf, pos);
switch(c) {
case '\r': {
silentCarriageReturn();
@@ -2500,7 +2648,7 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
}
}
case NS_HTML5TOKENIZER_CDATA: {
case NS_HTML5TOKENIZER_SCRIPT_DATA: {
for (; ; ) {
if (reconsume) {
reconsume = PR_FALSE;
@@ -2514,8 +2662,8 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
case '<': {
flushChars(buf, pos);
returnState = state;
state = NS_HTML5TOKENIZER_TAG_OPEN_NON_PCDATA;
goto cdataloop_end;
state = NS_HTML5TOKENIZER_SCRIPT_DATA_LESS_THAN_SIGN_STATE;
goto scriptdataloop_end;
}
case '\0': {
emitReplacementCharacter(buf, pos);
@@ -2533,39 +2681,37 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
}
}
}
cdataloop_end: ;
scriptdataloop_end: ;
}
case NS_HTML5TOKENIZER_TAG_OPEN_NON_PCDATA: {
case NS_HTML5TOKENIZER_SCRIPT_DATA_LESS_THAN_SIGN_STATE: {
for (; ; ) {
if (++pos == endPos) {
goto stateloop_end;
}
c = checkChar(buf, pos);
switch(c) {
case '/': {
index = 0;
clearStrBufForNextState();
state = NS_HTML5TOKENIZER_NON_DATA_END_TAG_NAME;
goto stateloop;
}
case '!': {
tokenHandler->characters(nsHtml5Tokenizer::LT_GT, 0, 1);
cstart = pos;
state = NS_HTML5TOKENIZER_ESCAPE_EXCLAMATION;
goto tagopennonpcdataloop_end;
}
case '/': {
if (!!contentModelElement) {
index = 0;
clearStrBufForNextState();
state = NS_HTML5TOKENIZER_CLOSE_TAG_OPEN_NOT_PCDATA;
goto stateloop;
}
goto scriptdatalessthansignloop_end;
}
default: {
tokenHandler->characters(nsHtml5Tokenizer::LT_GT, 0, 1);
cstart = pos;
state = returnState;
state = NS_HTML5TOKENIZER_SCRIPT_DATA;
reconsume = PR_TRUE;
goto stateloop;
}
}
}
tagopennonpcdataloop_end: ;
scriptdatalessthansignloop_end: ;
}
case NS_HTML5TOKENIZER_ESCAPE_EXCLAMATION: {
for (; ; ) {
@@ -2703,74 +2849,7 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
}
}
case NS_HTML5TOKENIZER_CLOSE_TAG_OPEN_NOT_PCDATA: {
for (; ; ) {
if (++pos == endPos) {
goto stateloop_end;
}
c = checkChar(buf, pos);
if (index < contentModelElementNameAsArray.length) {
PRUnichar e = contentModelElementNameAsArray[index];
PRUnichar folded = c;
if (c >= 'A' && c <= 'Z') {
folded += 0x20;
}
if (folded != e) {
tokenHandler->characters(nsHtml5Tokenizer::LT_SOLIDUS, 0, 2);
emitStrBuf();
cstart = pos;
state = returnState;
reconsume = PR_TRUE;
goto stateloop;
}
appendStrBuf(c);
index++;
continue;
} else {
endTag = PR_TRUE;
tagName = contentModelElement;
switch(c) {
case '\r': {
silentCarriageReturn();
state = NS_HTML5TOKENIZER_BEFORE_ATTRIBUTE_NAME;
goto stateloop_end;
}
case '\n': {
silentLineFeed();
}
case ' ':
case '\t':
case '\f': {
state = NS_HTML5TOKENIZER_BEFORE_ATTRIBUTE_NAME;
goto stateloop;
}
case '>': {
state = emitCurrentTagToken(PR_FALSE, pos);
if (shouldSuspend) {
goto stateloop_end;
}
goto stateloop;
}
case '/': {
state = NS_HTML5TOKENIZER_SELF_CLOSING_START_TAG;
goto stateloop;
}
default: {
tokenHandler->characters(nsHtml5Tokenizer::LT_SOLIDUS, 0, 2);
emitStrBuf();
if (c == '\0') {
emitReplacementCharacter(buf, pos);
} else {
cstart = pos;
}
state = returnState;
goto stateloop;
}
}
}
}
}
case NS_HTML5TOKENIZER_CLOSE_TAG_OPEN_PCDATA: {
case NS_HTML5TOKENIZER_CLOSE_TAG_OPEN: {
if (++pos == endPos) {
goto stateloop_end;
}
@@ -2839,7 +2918,7 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
case '<': {
flushChars(buf, pos);
returnState = state;
state = NS_HTML5TOKENIZER_TAG_OPEN_NON_PCDATA;
state = NS_HTML5TOKENIZER_RAWTEXT_RCDATA_LESS_THAN_SIGN_STATE;
goto stateloop;
}
case '\0': {
@@ -2860,6 +2939,132 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
}
}
case NS_HTML5TOKENIZER_RAWTEXT: {
for (; ; ) {
if (reconsume) {
reconsume = PR_FALSE;
} else {
if (++pos == endPos) {
goto stateloop_end;
}
c = checkChar(buf, pos);
}
switch(c) {
case '<': {
flushChars(buf, pos);
returnState = state;
state = NS_HTML5TOKENIZER_RAWTEXT_RCDATA_LESS_THAN_SIGN_STATE;
goto rawtextloop_end;
}
case '\0': {
emitReplacementCharacter(buf, pos);
continue;
}
case '\r': {
emitCarriageReturn(buf, pos);
goto stateloop_end;
}
case '\n': {
silentLineFeed();
}
default: {
continue;
}
}
}
rawtextloop_end: ;
}
case NS_HTML5TOKENIZER_RAWTEXT_RCDATA_LESS_THAN_SIGN_STATE: {
for (; ; ) {
if (++pos == endPos) {
goto stateloop_end;
}
c = checkChar(buf, pos);
switch(c) {
case '/': {
index = 0;
clearStrBufForNextState();
state = NS_HTML5TOKENIZER_NON_DATA_END_TAG_NAME;
goto rawtextrcdatalessthansignloop_end;
}
default: {
tokenHandler->characters(nsHtml5Tokenizer::LT_GT, 0, 1);
cstart = pos;
state = returnState;
reconsume = PR_TRUE;
goto stateloop;
}
}
}
rawtextrcdatalessthansignloop_end: ;
}
case NS_HTML5TOKENIZER_NON_DATA_END_TAG_NAME: {
for (; ; ) {
if (++pos == endPos) {
goto stateloop_end;
}
c = checkChar(buf, pos);
if (index < contentModelElementNameAsArray.length) {
PRUnichar e = contentModelElementNameAsArray[index];
PRUnichar folded = c;
if (c >= 'A' && c <= 'Z') {
folded += 0x20;
}
if (folded != e) {
tokenHandler->characters(nsHtml5Tokenizer::LT_SOLIDUS, 0, 2);
emitStrBuf();
cstart = pos;
state = returnState;
reconsume = PR_TRUE;
goto stateloop;
}
appendStrBuf(c);
index++;
continue;
} else {
endTag = PR_TRUE;
tagName = contentModelElement;
switch(c) {
case '\r': {
silentCarriageReturn();
state = NS_HTML5TOKENIZER_BEFORE_ATTRIBUTE_NAME;
goto stateloop_end;
}
case '\n': {
silentLineFeed();
}
case ' ':
case '\t':
case '\f': {
state = NS_HTML5TOKENIZER_BEFORE_ATTRIBUTE_NAME;
goto stateloop;
}
case '/': {
state = NS_HTML5TOKENIZER_SELF_CLOSING_START_TAG;
goto stateloop;
}
case '>': {
state = emitCurrentTagToken(PR_FALSE, pos);
if (shouldSuspend) {
goto stateloop_end;
}
goto stateloop;
}
default: {
tokenHandler->characters(nsHtml5Tokenizer::LT_SOLIDUS, 0, 2);
emitStrBuf();
if (c == '\0') {
emitReplacementCharacter(buf, pos);
} else {
cstart = pos;
}
state = returnState;
goto stateloop;
}
}
}
}
}
}
}
stateloop_end: ;
@@ -2943,17 +3148,11 @@ nsHtml5Tokenizer::handleNcrValue(PRInt32 returnState)
} else if (value == 0x0D) {
emitOrAppendOne(nsHtml5Tokenizer::LF, returnState);
} else if ((value >= 0x0000 && value <= 0x0008) || (value == 0x000B) || (value >= 0x000E && value <= 0x001F) || value == 0x007F) {
} else if (value == 0x0) {
emitOrAppendOne(nsHtml5Tokenizer::REPLACEMENT_CHARACTER, returnState);
} else if ((value & 0xF800) == 0xD800) {
emitOrAppendOne(nsHtml5Tokenizer::REPLACEMENT_CHARACTER, returnState);
} else if ((value & 0xFFFE) == 0xFFFE) {
emitOrAppendOne(nsHtml5Tokenizer::REPLACEMENT_CHARACTER, returnState);
} else if (value >= 0xFDD0 && value <= 0xFDEF) {
emitOrAppendOne(nsHtml5Tokenizer::REPLACEMENT_CHARACTER, returnState);
} else if (value <= 0xFFFF) {
PRUnichar ch = (PRUnichar) value;
@@ -2976,7 +3175,7 @@ nsHtml5Tokenizer::eof()
PRInt32 returnState = returnStateSave;
eofloop: for (; ; ) {
switch(state) {
case NS_HTML5TOKENIZER_TAG_OPEN_NON_PCDATA: {
case NS_HTML5TOKENIZER_SCRIPT_DATA_LESS_THAN_SIGN_STATE: {
tokenHandler->characters(nsHtml5Tokenizer::LT_GT, 0, 1);
goto eofloop_end;
}
@@ -2985,7 +3184,7 @@ nsHtml5Tokenizer::eof()
tokenHandler->characters(nsHtml5Tokenizer::LT_GT, 0, 1);
goto eofloop_end;
}
case NS_HTML5TOKENIZER_CLOSE_TAG_OPEN_NOT_PCDATA: {
case NS_HTML5TOKENIZER_NON_DATA_END_TAG_NAME: {
if (index < contentModelElementNameAsArray.length) {
goto eofloop_end;
} else {
@@ -2993,7 +3192,7 @@ nsHtml5Tokenizer::eof()
goto eofloop_end;
}
}
case NS_HTML5TOKENIZER_CLOSE_TAG_OPEN_PCDATA: {
case NS_HTML5TOKENIZER_CLOSE_TAG_OPEN: {
tokenHandler->characters(nsHtml5Tokenizer::LT_SOLIDUS, 0, 2);
goto eofloop_end;
@@ -3104,6 +3303,8 @@ nsHtml5Tokenizer::eof()
case NS_HTML5TOKENIZER_DOCTYPE_UBLIC:
case NS_HTML5TOKENIZER_DOCTYPE_YSTEM:
case NS_HTML5TOKENIZER_AFTER_DOCTYPE_NAME:
case NS_HTML5TOKENIZER_AFTER_DOCTYPE_PUBLIC_KEYWORD:
case NS_HTML5TOKENIZER_AFTER_DOCTYPE_SYSTEM_KEYWORD:
case NS_HTML5TOKENIZER_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: {
forceQuirks = PR_TRUE;
@@ -3119,7 +3320,8 @@ nsHtml5Tokenizer::eof()
goto eofloop_end;
}
case NS_HTML5TOKENIZER_AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
case NS_HTML5TOKENIZER_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: {
case NS_HTML5TOKENIZER_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
case NS_HTML5TOKENIZER_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: {
forceQuirks = PR_TRUE;
emitDoctypeToken(0);