Bug 594730 - Make the content attribute in <meta> act as an encoding declaration only if http-equiv="Content-Type" is present. rs=jonas, a=blocking2.0-betaN.

This commit is contained in:
Henri Sivonen
2010-12-08 14:37:19 +02:00
parent 84cb7c5086
commit e4808f1f2e
22 changed files with 471 additions and 203 deletions

View File

@@ -26,7 +26,7 @@ package nu.validator.htmlparser.impl;
import java.io.IOException;
import nu.validator.htmlparser.annotation.Auto;
import nu.validator.htmlparser.annotation.NoLength;
import nu.validator.htmlparser.annotation.Inline;
import nu.validator.htmlparser.common.ByteReadable;
import org.xml.sax.SAXException;
@@ -36,12 +36,22 @@ public abstract class MetaScanner {
/**
* Constant for "charset".
*/
private static final @NoLength char[] CHARSET = "charset".toCharArray();
private static final char[] CHARSET = "harset".toCharArray();
/**
* Constant for "content".
*/
private static final @NoLength char[] CONTENT = "content".toCharArray();
private static final char[] CONTENT = "ontent".toCharArray();
/**
* Constant for "http-equiv".
*/
private static final char[] HTTP_EQUIV = "ttp-equiv".toCharArray();
/**
* Constant for "content-type".
*/
private static final char[] CONTENT_TYPE = "content-type".toCharArray();
private static final int NO = 0;
@@ -93,6 +103,12 @@ public abstract class MetaScanner {
private static final int SELF_CLOSING_START_TAG = 20;
private static final int HTTP_EQUIV_NOT_SEEN = 0;
private static final int HTTP_EQUIV_CONTENT_TYPE = 1;
private static final int HTTP_EQUIV_OTHER = 2;
/**
* The data source.
*/
@@ -106,12 +122,22 @@ public abstract class MetaScanner {
/**
* The current position in recognizing the attribute name "content".
*/
private int contentIndex = -1;
private int contentIndex = Integer.MAX_VALUE;
/**
* The current position in recognizing the attribute name "charset".
*/
private int charsetIndex = -1;
private int charsetIndex = Integer.MAX_VALUE;
/**
* The current position in recognizing the attribute name "http-equive".
*/
private int httpEquivIndex = Integer.MAX_VALUE;
/**
* The current position in recognizing the attribute value "content-type".
*/
private int contentTypeIndex = Integer.MAX_VALUE;
/**
* The tokenizer state.
@@ -128,24 +154,34 @@ public abstract class MetaScanner {
*/
private @Auto char[] strBuf;
// [NOCPP[
private String content;
private String charset;
private int httpEquivState;
/**
* @param source
* @param errorHandler
* @param publicId
* @param systemId
*/
public MetaScanner() {
this.readable = null;
this.metaState = NO;
this.contentIndex = -1;
this.charsetIndex = -1;
this.contentIndex = Integer.MAX_VALUE;
this.charsetIndex = Integer.MAX_VALUE;
this.httpEquivIndex = Integer.MAX_VALUE;
this.contentTypeIndex = Integer.MAX_VALUE;
this.stateSave = DATA;
strBufLen = 0;
strBuf = new char[36];
this.strBufLen = 0;
this.strBuf = new char[36];
this.content = null;
this.charset = null;
this.httpEquivState = HTTP_EQUIV_NOT_SEEN;
}
@SuppressWarnings("unused") private void destructor() {
Portability.releaseString(content);
Portability.releaseString(charset);
}
// [NOCPP[
/**
* Reads a byte from the data source.
*
@@ -295,17 +331,32 @@ public abstract class MetaScanner {
state = MetaScanner.SELF_CLOSING_START_TAG;
continue stateloop;
case '>':
if (handleTag()) {
break stateloop;
}
state = DATA;
continue stateloop;
case 'c':
case 'C':
contentIndex = 0;
charsetIndex = 0;
httpEquivIndex = Integer.MAX_VALUE;
contentTypeIndex = Integer.MAX_VALUE;
state = MetaScanner.ATTRIBUTE_NAME;
break beforeattributenameloop;
case 'h':
case 'H':
contentIndex = Integer.MAX_VALUE;
charsetIndex = Integer.MAX_VALUE;
httpEquivIndex = 0;
contentTypeIndex = Integer.MAX_VALUE;
state = MetaScanner.ATTRIBUTE_NAME;
break beforeattributenameloop;
default:
contentIndex = -1;
charsetIndex = -1;
contentIndex = Integer.MAX_VALUE;
charsetIndex = Integer.MAX_VALUE;
httpEquivIndex = Integer.MAX_VALUE;
contentTypeIndex = Integer.MAX_VALUE;
state = MetaScanner.ATTRIBUTE_NAME;
break beforeattributenameloop;
// continue stateloop;
@@ -329,10 +380,14 @@ public abstract class MetaScanner {
continue stateloop;
case '=':
strBufLen = 0;
contentTypeIndex = 0;
state = MetaScanner.BEFORE_ATTRIBUTE_VALUE;
break attributenameloop;
// continue stateloop;
case '>':
if (handleTag()) {
break stateloop;
}
state = MetaScanner.DATA;
continue stateloop;
default:
@@ -340,20 +395,21 @@ public abstract class MetaScanner {
if (c >= 'A' && c <= 'Z') {
c += 0x20;
}
if (contentIndex == 6) {
contentIndex = -1;
} else if (contentIndex > -1
&& contentIndex < 6
&& (c == CONTENT[contentIndex + 1])) {
contentIndex++;
if (contentIndex < CONTENT.length && c == CONTENT[contentIndex]) {
++contentIndex;
} else {
contentIndex = Integer.MAX_VALUE;
}
if (charsetIndex == 6) {
charsetIndex = -1;
} else if (charsetIndex > -1
&& charsetIndex < 6
&& (c == CHARSET[charsetIndex + 1])) {
charsetIndex++;
if (charsetIndex < CHARSET.length && c == CHARSET[charsetIndex]) {
++charsetIndex;
} else {
charsetIndex = Integer.MAX_VALUE;
}
if (httpEquivIndex < HTTP_EQUIV.length && c == HTTP_EQUIV[httpEquivIndex]) {
++httpEquivIndex;
} else {
httpEquivIndex = Integer.MAX_VALUE;
}
}
continue;
}
@@ -378,12 +434,13 @@ public abstract class MetaScanner {
state = MetaScanner.ATTRIBUTE_VALUE_SINGLE_QUOTED;
continue stateloop;
case '>':
if (handleTag()) {
break stateloop;
}
state = MetaScanner.DATA;
continue stateloop;
default:
if (charsetIndex == 6 || contentIndex == 6) {
addToBuffer(c);
}
handleCharInAttributeValue(c);
state = MetaScanner.ATTRIBUTE_VALUE_UNQUOTED;
continue stateloop;
}
@@ -400,16 +457,12 @@ public abstract class MetaScanner {
case -1:
break stateloop;
case '"':
if (tryCharset()) {
break stateloop;
}
handleAttributeValue();
state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED;
break attributevaluedoublequotedloop;
// continue stateloop;
default:
if (metaState == A && (contentIndex == 6 || charsetIndex == 6)) {
addToBuffer(c);
}
handleCharInAttributeValue(c);
continue;
}
}
@@ -431,6 +484,9 @@ public abstract class MetaScanner {
break afterattributevaluequotedloop;
// continue stateloop;
case '>':
if (handleTag()) {
break stateloop;
}
state = MetaScanner.DATA;
continue stateloop;
default:
@@ -446,6 +502,9 @@ public abstract class MetaScanner {
case -1:
break stateloop;
case '>':
if (handleTag()) {
break stateloop;
}
state = MetaScanner.DATA;
continue stateloop;
default:
@@ -469,21 +528,18 @@ public abstract class MetaScanner {
case '\n':
case '\u000C':
if (tryCharset()) {
break stateloop;
}
handleAttributeValue();
state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
continue stateloop;
case '>':
if (tryCharset()) {
handleAttributeValue();
if (handleTag()) {
break stateloop;
}
state = MetaScanner.DATA;
continue stateloop;
default:
if (metaState == A && (contentIndex == 6 || charsetIndex == 6)) {
addToBuffer(c);
}
handleCharInAttributeValue(c);
continue;
}
}
@@ -500,16 +556,17 @@ public abstract class MetaScanner {
case '\u000C':
continue;
case '/':
if (tryCharset()) {
break stateloop;
}
handleAttributeValue();
state = MetaScanner.SELF_CLOSING_START_TAG;
continue stateloop;
case '=':
strBufLen = 0;
contentTypeIndex = 0;
state = MetaScanner.BEFORE_ATTRIBUTE_VALUE;
continue stateloop;
case '>':
if (tryCharset()) {
handleAttributeValue();
if (handleTag()) {
break stateloop;
}
state = MetaScanner.DATA;
@@ -656,15 +713,11 @@ public abstract class MetaScanner {
case -1:
break stateloop;
case '\'':
if (tryCharset()) {
break stateloop;
}
handleAttributeValue();
state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED;
continue stateloop;
default:
if (metaState == A && (contentIndex == 6 || charsetIndex == 6)) {
addToBuffer(c);
}
handleCharInAttributeValue(c);
continue;
}
}
@@ -691,6 +744,27 @@ public abstract class MetaScanner {
stateSave = state;
}
private void handleCharInAttributeValue(int c) {
if (metaState == A) {
if (contentIndex == CONTENT.length || charsetIndex == CHARSET.length) {
addToBuffer(c);
} else if (httpEquivIndex == HTTP_EQUIV.length) {
if (contentTypeIndex < CONTENT_TYPE.length && toAsciiLowerCase(c) == CONTENT_TYPE[contentTypeIndex]) {
++contentTypeIndex;
} else {
contentTypeIndex = Integer.MAX_VALUE;
}
}
}
}
@Inline private int toAsciiLowerCase(int c) {
if (c >= 'A' && c <= 'Z') {
return c + 0x20;
}
return c;
}
/**
* Adds a character to the accumulation buffer.
* @param c the character to add
@@ -709,28 +783,52 @@ public abstract class MetaScanner {
* @return <code>true</code> if successful
* @throws SAXException
*/
private boolean tryCharset() throws SAXException {
if (metaState != A || !(contentIndex == 6 || charsetIndex == 6)) {
return false;
private void handleAttributeValue() throws SAXException {
if (metaState != A) {
return;
}
String attVal = Portability.newStringFromBuffer(strBuf, 0, strBufLen);
String candidateEncoding;
if (contentIndex == 6) {
candidateEncoding = TreeBuilder.extractCharsetFromContent(attVal);
Portability.releaseString(attVal);
} else {
candidateEncoding = attVal;
if (contentIndex == CONTENT.length && content == null) {
content = Portability.newStringFromBuffer(strBuf, 0, strBufLen);
return;
}
if (candidateEncoding == null) {
return false;
if (charsetIndex == CHARSET.length && charset == null) {
charset = Portability.newStringFromBuffer(strBuf, 0, strBufLen);
return;
}
boolean success = tryCharset(candidateEncoding);
Portability.releaseString(candidateEncoding);
contentIndex = -1;
charsetIndex = -1;
return success;
if (httpEquivIndex == HTTP_EQUIV.length
&& httpEquivState == HTTP_EQUIV_NOT_SEEN) {
httpEquivState = (contentTypeIndex == CONTENT_TYPE.length) ? HTTP_EQUIV_CONTENT_TYPE
: HTTP_EQUIV_OTHER;
return;
}
}
private boolean handleTag() throws SAXException {
boolean stop = handleTagInner();
Portability.releaseString(content);
content = null;
Portability.releaseString(charset);
charset = null;
httpEquivState = HTTP_EQUIV_NOT_SEEN;
return stop;
}
private boolean handleTagInner() throws SAXException {
if (charset != null && tryCharset(charset)) {
return true;
}
if (content != null && httpEquivState == HTTP_EQUIV_CONTENT_TYPE) {
String extract = TreeBuilder.extractCharsetFromContent(content);
if (extract == null) {
return false;
}
boolean success = tryCharset(extract);
Portability.releaseString(extract);
return success;
}
return false;
}
/**
* Tries to switch to an encoding.
*
@@ -739,6 +837,5 @@ public abstract class MetaScanner {
* @throws SAXException
*/
protected abstract boolean tryCharset(String encoding) throws SAXException;
}