html: tokenize HTML comments.

I'm not sure if it's 100% correct wrt the HTML5 specification,
but the test suite has plenty of HTML comment test cases, and
we'll shake out any tokenization bugs as the parser improves its
coverage.

R=gri
CC=golang-dev
https://golang.org/cl/4186055
Nigel Tao 2011-02-17 10:45:30 +11:00
parent 3a2d64789b
commit a5ff8ad9db
3 changed files with 152 additions and 54 deletions

src/pkg/html/doc.go

@@ -69,6 +69,9 @@ call to Next. For example, to extract an HTML page's anchor text:
 		}
 	}
 
+A Tokenizer typically skips over HTML comments. To return comment tokens, set
+Tokenizer.ReturnComments to true before looping over calls to Next.
+
 Parsing is done by calling Parse with an io.Reader, which returns the root of
 the parse tree (the document element) as a *Node. It is the caller's
 responsibility to ensure that the Reader provides UTF-8 encoded HTML. For
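To make the new flag concrete, here is a minimal usage sketch of the loop this paragraph describes. It assumes the in-tree html package as patched by this change (the "html" import path and the era of the surrounding APIs are assumptions, not part of the diff):

	package main

	import (
		"bytes"
		"fmt"
		"html" // assumption: the in-tree package this change patches
	)

	func main() {
		z := html.NewTokenizer(bytes.NewBufferString("a<!-- note -->b"))
		z.ReturnComments = true // comments are skipped by default
		for {
			tt := z.Next()
			if tt == html.ErrorToken {
				break // end of input, or a read error (see z.Error())
			}
			if tt == html.CommentToken {
				fmt.Printf("comment: %q\n", z.Text())
			}
		}
	}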

src/pkg/html/token.go

@@ -25,6 +25,8 @@ const (
 	EndTagToken
 	// A SelfClosingTagToken tag looks like <br/>.
 	SelfClosingTagToken
+	// A CommentToken looks like <!--x-->.
+	CommentToken
 )
 
 // String returns a string representation of the TokenType.
@@ -40,6 +42,8 @@ func (t TokenType) String() string {
 		return "EndTag"
 	case SelfClosingTagToken:
 		return "SelfClosingTag"
+	case CommentToken:
+		return "Comment"
 	}
 	return "Invalid(" + strconv.Itoa(int(t)) + ")"
 }
@@ -52,8 +56,8 @@ type Attribute struct {
 }
 
 // A Token consists of a TokenType and some Data (tag name for start and end
-// tags, content for text). A tag Token may also contain a slice of Attributes.
-// Data is unescaped for both tag and text Tokens (it looks like "a<b" rather
+// tags, content for text and comments). A tag Token may also contain a slice
+// of Attributes. Data is unescaped for all Tokens (it looks like "a<b" rather
 // than "a&lt;b").
 type Token struct {
 	Type TokenType
@@ -91,12 +95,18 @@ func (t Token) String() string {
 		return "</" + t.tagString() + ">"
 	case SelfClosingTagToken:
 		return "<" + t.tagString() + "/>"
+	case CommentToken:
+		return "<!--" + EscapeString(t.Data) + "-->"
 	}
 	return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
 }
 
 // A Tokenizer returns a stream of HTML Tokens.
 type Tokenizer struct {
+	// If ReturnComments is set, Next returns comment tokens;
+	// otherwise it skips over comments (default).
+	ReturnComments bool
+
 	// r is the source of the HTML text.
 	r io.Reader
 
 	// tt is the TokenType of the most recently read token. If tt == Error
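The Data/escaping contract above is easy to see in action: Data holds the unescaped text, and String re-escapes it on the way out via EscapeString. A small sketch, under the same in-tree package assumption as earlier:

	package main

	import (
		"fmt"
		"html" // the in-tree package this change patches
	)

	func main() {
		// Data is stored unescaped; String re-escapes it.
		t := html.Token{Type: html.CommentToken, Data: "a<b"}
		fmt.Println(t) // prints: <!--a&lt;b-->
	}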
@@ -176,6 +186,39 @@ func (z *Tokenizer) readTo(x uint8) os.Error {
 	panic("unreachable")
 }
 
+// nextMarkupDeclaration returns the next TokenType starting with "<!".
+func (z *Tokenizer) nextMarkupDeclaration() (TokenType, os.Error) {
+	// TODO: check for <!DOCTYPE ... >, don't just assume that it's a comment.
+	for i := 0; i < 2; i++ {
+		c, err := z.readByte()
+		if err != nil {
+			return TextToken, err
+		}
+		if c != '-' {
+			return z.nextText(), nil
+		}
+	}
+	// <!--> is a valid comment.
+	for dashCount := 2; ; {
+		c, err := z.readByte()
+		if err != nil {
+			return TextToken, err
+		}
+		switch c {
+		case '-':
+			dashCount++
+		case '>':
+			if dashCount >= 2 {
+				return CommentToken, nil
+			}
+			fallthrough
+		default:
+			dashCount = 0
+		}
+	}
+	panic("unreachable")
+}
+
 // nextTag returns the next TokenType starting from the tag open state.
 func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 	c, err := z.readByte()
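The loop above encodes a subtle termination rule: a '>' ends the comment only when at least two '-' bytes immediately precede it, and dashCount starts at 2, so the degenerate "<!-->" and "<!--->" already count as complete comments. A self-contained sketch of just that rule (commentEnd is a hypothetical helper, not part of the package):

	package main

	import "fmt"

	// commentEnd mirrors nextMarkupDeclaration's dash counting: s is the
	// input just after "<!--", and the result is the index just past the
	// closing '>', or -1 if the comment never terminates.
	func commentEnd(s string) int {
		dashCount := 2 // the two dashes of "<!--" already seen
		for i := 0; i < len(s); i++ {
			switch s[i] {
			case '-':
				dashCount++
			case '>':
				if dashCount >= 2 {
					return i + 1
				}
				dashCount = 0
			default:
				dashCount = 0
			}
		}
		return -1
	}

	func main() {
		fmt.Println(commentEnd(">"))     // 1: "<!-->" is a valid comment
		fmt.Println(commentEnd("x>-->")) // 5: the inner '>' lacks two dashes
		fmt.Println(commentEnd("z"))     // -1: unterminated, like "a<!--z"
	}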
@@ -189,7 +232,7 @@ func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 	case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
 		tt = StartTagToken
 	case c == '!':
-		return ErrorToken, os.NewError("html: TODO(nigeltao): implement comments")
+		return z.nextMarkupDeclaration()
 	case c == '?':
 		return ErrorToken, os.NewError("html: TODO(nigeltao): implement XML processing instructions")
 	default:
@@ -221,22 +264,8 @@ func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 	panic("unreachable")
 }
 
-// Next scans the next token and returns its type.
-func (z *Tokenizer) Next() TokenType {
-	if z.err != nil {
-		z.tt = ErrorToken
-		return z.tt
-	}
-	z.p0 = z.p1
-	c, err := z.readByte()
-	if err != nil {
-		z.tt, z.err = ErrorToken, err
-		return z.tt
-	}
-	if c == '<' {
-		z.tt, z.err = z.nextTag()
-		return z.tt
-	}
+// nextText reads all text up until an '<'.
+func (z *Tokenizer) nextText() TokenType {
 	for {
 		c, err := z.readByte()
 		if err != nil {
@@ -255,6 +284,31 @@ func (z *Tokenizer) Next() TokenType {
 	panic("unreachable")
 }
 
+// Next scans the next token and returns its type.
+func (z *Tokenizer) Next() TokenType {
+	for {
+		if z.err != nil {
+			z.tt = ErrorToken
+			return z.tt
+		}
+		z.p0 = z.p1
+		c, err := z.readByte()
+		if err != nil {
+			z.tt, z.err = ErrorToken, err
+			return z.tt
+		}
+		if c == '<' {
+			z.tt, z.err = z.nextTag()
+			if z.tt == CommentToken && !z.ReturnComments {
+				continue
+			}
+			return z.tt
+		}
+		return z.nextText()
+	}
+	panic("unreachable")
+}
+
 // trim returns the largest j such that z.buf[i:j] contains only white space,
 // or only white space plus the final ">" or "/>" of the raw data.
 func (z *Tokenizer) trim(i int) int {
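With the rewritten loop above, comment skipping is invisible to callers unless they opt in: the continue swallows the CommentToken before Next returns. A sketch of that default behavior, under the same in-tree package assumption as earlier:

	package main

	import (
		"bytes"
		"fmt"
		"html" // the in-tree package this change patches
	)

	func main() {
		// ReturnComments is left false, so callers see only text and tags.
		z := html.NewTokenizer(bytes.NewBufferString("abc<b><!-- skipme --></b>def"))
		for {
			tt := z.Next()
			if tt == html.ErrorToken {
				break
			}
			fmt.Println(tt) // Text, StartTag, EndTag, Text
		}
	}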
@@ -299,12 +353,27 @@ loop:
 		return z.buf[i0:i], z.trim(i)
 }
 
-// Text returns the raw data after unescaping.
+// Text returns the unescaped text of a TextToken or a CommentToken.
 // The contents of the returned slice may change on the next call to Next.
 func (z *Tokenizer) Text() []byte {
-	s := unescape(z.Raw())
-	z.p0 = z.p1
-	return s
+	switch z.tt {
+	case TextToken:
+		s := unescape(z.Raw())
+		z.p0 = z.p1
+		return s
+	case CommentToken:
+		// We trim the "<!--" from the left and the "-->" from the right.
+		// "<!-->" is a valid comment, so the adjusted endpoints might overlap.
+		i0 := z.p0 + 4
+		i1 := z.p1 - 3
+		z.p0 = z.p1
+		var s []byte
+		if i0 < i1 {
+			s = unescape(z.buf[i0:i1])
+		}
+		return s
+	}
+	return nil
 }
 
 // TagName returns the lower-cased name of a tag token (the `img` out of
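The endpoint arithmetic in the CommentToken case trims the 4-byte "<!--" prefix and the 3-byte "-->" suffix from the raw token, and the i0 < i1 guard handles the overlap that "<!-->" produces. A standalone sketch of just that arithmetic (commentText is a hypothetical helper; unescaping is elided):

	package main

	import "fmt"

	// commentText mimics Text's trimming: drop "<!--" (4 bytes) from the
	// left and "-->" (3 bytes) from the right of a raw comment token.
	func commentText(raw string) string {
		i0, i1 := 4, len(raw)-3
		if i0 < i1 {
			return raw[i0:i1]
		}
		return "" // e.g. "<!-->": the trimmed endpoints overlap
	}

	func main() {
		fmt.Printf("%q\n", commentText("<!-- skipme -->")) // " skipme "
		fmt.Printf("%q\n", commentText("<!-->"))           // ""
	}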
@@ -372,7 +441,7 @@ loop:
 func (z *Tokenizer) Token() Token {
 	t := Token{Type: z.tt}
 	switch z.tt {
-	case TextToken:
+	case TextToken, CommentToken:
 		t.Data = string(z.Text())
 	case StartTagToken, EndTagToken, SelfClosingTagToken:
 		var attr []Attribute

src/pkg/html/token_test.go

@@ -7,6 +7,7 @@ package html
 import (
 	"bytes"
 	"os"
+	"strings"
 	"testing"
 )
@@ -15,8 +16,8 @@ type tokenTest struct {
 	desc string
 	// The HTML to parse.
 	html string
-	// The string representations of the expected tokens.
-	tokens []string
+	// The string representations of the expected tokens, joined by '$'.
+	golden string
 }
var tokenTests = []tokenTest{
@@ -25,61 +26,86 @@ var tokenTests = []tokenTest{
 	{
 		"text",
 		"foo bar",
-		[]string{
-			"foo bar",
-		},
+		"foo bar",
 	},
 	// An entity.
 	{
 		"entity",
 		"one &lt; two",
-		[]string{
-			"one &lt; two",
-		},
+		"one &lt; two",
 	},
 	// A start, self-closing and end tag. The tokenizer does not care if the start
 	// and end tokens don't match; that is the job of the parser.
 	{
 		"tags",
 		"<a>b<c/>d</e>",
-		[]string{
-			"<a>",
-			"b",
-			"<c/>",
-			"d",
-			"</e>",
-		},
+		"<a>$b$<c/>$d$</e>",
 	},
+	// Comments.
+	{
+		"comment0",
+		"abc<b><!-- skipme --></b>def",
+		"abc$<b>$</b>$def",
+	},
+	{
+		"comment1",
+		"a<!-->z",
+		"a$z",
+	},
+	{
+		"comment2",
+		"a<!--->z",
+		"a$z",
+	},
+	{
+		"comment3",
+		"a<!--x>-->z",
+		"a$z",
+	},
+	{
+		"comment4",
+		"a<!--x->-->z",
+		"a$z",
+	},
+	{
+		"comment5",
+		"a<!>z",
+		"a$&lt;!&gt;z",
+	},
+	{
+		"comment6",
+		"a<!->z",
+		"a$&lt;!-&gt;z",
+	},
+	{
+		"comment7",
+		"a<!---<>z",
+		"a$&lt;!---&lt;&gt;z",
+	},
+	{
+		"comment8",
+		"a<!--z",
+		"a$&lt;!--z",
+	},
 	// An attribute with a backslash.
 	{
 		"backslash",
 		`<p id="a\"b">`,
-		[]string{
-			`<p id="a&quot;b">`,
-		},
+		`<p id="a&quot;b">`,
 	},
 	// Entities, tag name and attribute key lower-casing, and whitespace
 	// normalization within a tag.
 	{
 		"tricky",
 		"<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
-		[]string{
-			`<p id="a&quot;B" foo="bar">`,
-			"<em>",
-			"te&lt;&amp;;xt",
-			"</em>",
-			"</p>",
-		},
+		`<p id="a&quot;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
 	},
 	// A non-existent entity. Tokenizing and converting back to a string should
 	// escape the "&" to become "&amp;".
 	{
 		"noSuchEntity",
 		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
-		[]string{
-			`<a b="c&amp;noSuchEntity;d">`,
-			"&lt;&amp;alsoDoesntExist;&amp;",
-		},
+		`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
 	},
 }
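For reference, each '$'-separated field of a golden string is one expected token; in post-Go 1 terms, the strings.Split call in the test loop below is the plain two-argument form:

	package main

	import (
		"fmt"
		"strings"
	)

	func main() {
		golden := "<a>$b$<c/>$d$</e>" // the "tags" case above
		// Pre-Go 1, this was written strings.Split(golden, "$", -1).
		for i, s := range strings.Split(golden, "$") {
			fmt.Printf("token %d: %s\n", i, s)
		}
	}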
@@ -87,7 +113,7 @@ func TestTokenizer(t *testing.T) {
 loop:
 	for _, tt := range tokenTests {
 		z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
-		for i, s := range tt.tokens {
+		for i, s := range strings.Split(tt.golden, "$", -1) {
 			if z.Next() == ErrorToken {
 				t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
 				continue loop