Bug 506090 - Speed up the tokenization of named character references in the HTML5 parser. r=bnewman.

This commit is contained in:
Henri Sivonen
2010-02-10 11:23:35 +02:00
parent 8a72d38cad
commit 382fdd2350
8 changed files with 2575 additions and 2216 deletions

View File

@@ -1,6 +1,6 @@
/*
* Copyright (c) 2005-2007 Henri Sivonen
* Copyright (c) 2007-2009 Mozilla Foundation
* Copyright (c) 2007-2010 Mozilla Foundation
* Portions of comments Copyright 2004-2008 Apple Computer, Inc., Mozilla
* Foundation, and Opera Software ASA.
*
@@ -2350,46 +2350,70 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
reconsume = PR_TRUE;
goto stateloop;
}
entCol = -1;
lo = 0;
hi = (nsHtml5NamedCharacters::NAMES.length - 1);
candidate = -1;
strBufMark = 0;
state = NS_HTML5TOKENIZER_CHARACTER_REFERENCE_LOOP;
reconsume = PR_TRUE;
if (c >= 'a' && c <= 'z') {
firstCharKey = c - 'a' + 26;
} else if (c >= 'A' && c <= 'Z') {
firstCharKey = c - 'A';
} else {
emitOrAppendStrBuf(returnState);
if (!(returnState & (~1))) {
cstart = pos;
}
state = returnState;
reconsume = PR_TRUE;
goto stateloop;
}
appendStrBuf(c);
state = NS_HTML5TOKENIZER_CHARACTER_REFERENCE_HILO_LOOKUP;
}
}
}
case NS_HTML5TOKENIZER_CHARACTER_REFERENCE_LOOP: {
for (; ; ) {
if (reconsume) {
reconsume = PR_FALSE;
} else {
if (++pos == endPos) {
goto stateloop_end;
}
c = checkChar(buf, pos);
case NS_HTML5TOKENIZER_CHARACTER_REFERENCE_HILO_LOOKUP: {
{
if (++pos == endPos) {
goto stateloop_end;
}
c = checkChar(buf, pos);
if (c == '\0') {
goto stateloop_end;
}
PRInt32 hilo = 0;
if (c <= 'z') {
const PRInt32* row = nsHtml5NamedCharacters::HILO_ACCEL[c];
if (!!row) {
hilo = row[firstCharKey];
}
}
if (!hilo) {
emitOrAppendStrBuf(returnState);
if (!(returnState & (~1))) {
cstart = pos;
}
state = returnState;
reconsume = PR_TRUE;
goto stateloop;
}
appendStrBuf(c);
lo = hilo & 0xFFFF;
hi = hilo >> 16;
entCol = -1;
candidate = -1;
strBufMark = 0;
state = NS_HTML5TOKENIZER_CHARACTER_REFERENCE_TAIL;
}
}
case NS_HTML5TOKENIZER_CHARACTER_REFERENCE_TAIL: {
for (; ; ) {
if (++pos == endPos) {
goto stateloop_end;
}
c = checkChar(buf, pos);
if (c == '\0') {
goto stateloop_end;
}
entCol++;
for (; ; ) {
if (hi == -1) {
goto hiloop_end;
}
if (entCol == nsHtml5NamedCharacters::NAMES[hi].length) {
goto hiloop_end;
}
if (entCol > nsHtml5NamedCharacters::NAMES[hi].length) {
goto outer_end;
} else if (c < nsHtml5NamedCharacters::NAMES[hi][entCol]) {
hi--;
} else {
goto hiloop_end;
}
}
hiloop_end: ;
for (; ; ) {
if (hi < lo) {
goto outer_end;
@@ -2407,6 +2431,22 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
}
}
loloop_end: ;
for (; ; ) {
if (hi < lo) {
goto outer_end;
}
if (entCol == nsHtml5NamedCharacters::NAMES[hi].length) {
goto hiloop_end;
}
if (entCol > nsHtml5NamedCharacters::NAMES[hi].length) {
goto outer_end;
} else if (c < nsHtml5NamedCharacters::NAMES[hi][entCol]) {
hi--;
} else {
goto hiloop_end;
}
}
hiloop_end: ;
if (hi < lo) {
goto outer_end;
}
@@ -2424,8 +2464,8 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
reconsume = PR_TRUE;
goto stateloop;
} else {
jArray<PRUnichar,PRInt32> candidateArr = nsHtml5NamedCharacters::NAMES[candidate];
if (candidateArr[candidateArr.length - 1] != ';') {
jArray<PRInt8,PRInt32> candidateArr = nsHtml5NamedCharacters::NAMES[candidate];
if (!candidateArr.length || candidateArr[candidateArr.length - 1] != ';') {
if ((returnState & (~1))) {
PRUnichar ch;
if (strBufMark == strBufLen) {
@@ -2447,8 +2487,12 @@ nsHtml5Tokenizer::stateLoop(PRInt32 state, PRUnichar c, PRInt32 pos, PRUnichar*
}
}
jArray<PRUnichar,PRInt32> val = nsHtml5NamedCharacters::VALUES[candidate];
emitOrAppend(val, returnState);
const PRUnichar* val = nsHtml5NamedCharacters::VALUES[candidate];
if ((val[0] & 0xFC00) == 0xD800) {
emitOrAppendTwo(val, returnState);
} else {
emitOrAppendOne(val, returnState);
}
if (strBufMark < strBufLen) {
if ((returnState & (~1))) {
for (PRInt32 i = strBufMark; i < strBufLen; i++) {
@@ -3435,7 +3479,7 @@ nsHtml5Tokenizer::handleNcrValue(PRInt32 returnState)
} else if (value <= 0x10FFFF) {
astralChar[0] = (PRUnichar) (NS_HTML5TOKENIZER_LEAD_OFFSET + (value >> 10));
astralChar[1] = (PRUnichar) (0xDC00 + (value & 0x3FF));
emitOrAppend(astralChar, returnState);
emitOrAppendTwo(astralChar, returnState);
} else {
emitOrAppendOne(nsHtml5Tokenizer::REPLACEMENT_CHARACTER, returnState);
@@ -3626,7 +3670,13 @@ nsHtml5Tokenizer::eof()
state = returnState;
continue;
}
case NS_HTML5TOKENIZER_CHARACTER_REFERENCE_LOOP: {
case NS_HTML5TOKENIZER_CHARACTER_REFERENCE_HILO_LOOKUP: {
emitOrAppendStrBuf(returnState);
state = returnState;
continue;
}
case NS_HTML5TOKENIZER_CHARACTER_REFERENCE_TAIL: {
for (; ; ) {
PRUnichar c = '\0';
entCol++;
@@ -3675,8 +3725,8 @@ nsHtml5Tokenizer::eof()
state = returnState;
goto eofloop;
} else {
jArray<PRUnichar,PRInt32> candidateArr = nsHtml5NamedCharacters::NAMES[candidate];
if (candidateArr[candidateArr.length - 1] != ';') {
jArray<PRInt8,PRInt32> candidateArr = nsHtml5NamedCharacters::NAMES[candidate];
if (!candidateArr.length || candidateArr[candidateArr.length - 1] != ';') {
if ((returnState & (~1))) {
PRUnichar ch;
if (strBufMark == strBufLen) {
@@ -3697,8 +3747,12 @@ nsHtml5Tokenizer::eof()
}
}
jArray<PRUnichar,PRInt32> val = nsHtml5NamedCharacters::VALUES[candidate];
emitOrAppend(val, returnState);
const PRUnichar* val = nsHtml5NamedCharacters::VALUES[candidate];
if ((val[0] & 0xFC00) == 0xD800) {
emitOrAppendTwo(val, returnState);
} else {
emitOrAppendOne(val, returnState);
}
if (strBufMark < strBufLen) {
if ((returnState & (~1))) {
for (PRInt32 i = strBufMark; i < strBufLen; i++) {
@@ -3760,17 +3814,18 @@ nsHtml5Tokenizer::internalEncodingDeclaration(nsString* internalCharset)
}
void
nsHtml5Tokenizer::emitOrAppend(jArray<PRUnichar,PRInt32> val, PRInt32 returnState)
nsHtml5Tokenizer::emitOrAppendTwo(const PRUnichar* val, PRInt32 returnState)
{
if ((returnState & (~1))) {
appendLongStrBuf(val);
appendLongStrBuf(val[0]);
appendLongStrBuf(val[1]);
} else {
tokenHandler->characters(val, 0, val.length);
tokenHandler->characters(val, 0, 2);
}
}
void
nsHtml5Tokenizer::emitOrAppendOne(PRUnichar* val, PRInt32 returnState)
nsHtml5Tokenizer::emitOrAppendOne(const PRUnichar* val, PRInt32 returnState)
{
if ((returnState & (~1))) {
appendLongStrBuf(val[0]);
@@ -3865,6 +3920,7 @@ nsHtml5Tokenizer::resetToDataState()
forceQuirks = PR_FALSE;
additional = '\0';
entCol = -1;
firstCharKey = -1;
lo = 0;
hi = (nsHtml5NamedCharacters::NAMES.length - 1);
candidate = -1;
@@ -3913,6 +3969,7 @@ nsHtml5Tokenizer::loadState(nsHtml5Tokenizer* other)
forceQuirks = other->forceQuirks;
additional = other->additional;
entCol = other->entCol;
firstCharKey = other->firstCharKey;
lo = other->lo;
hi = other->hi;
candidate = other->candidate;