exp/html: replace NUL bytes in plaintext, raw text, and RCDATA

If NUL bytes occur inside certain elements, convert each of them to the
U+FFFD replacement character.

Pass 1 additional test.

R=nigeltao
CC=golang-dev
https://golang.org/cl/6452047
This commit is contained in:
Andrew Balholm 2012-07-27 09:27:10 +10:00 committed by Nigel Tao
parent d399b681a4
commit 55f0c8b2cd
2 changed files with 15 additions and 1 deletions

View File

@ -7,7 +7,7 @@ PASS "<html>\x00\n <frameset></frameset>"
PASS "<html><select>\x00" PASS "<html><select>\x00"
PASS "\x00" PASS "\x00"
PASS "<body>\x00" PASS "<body>\x00"
FAIL "<plaintext>\x00filler\x00text\x00" PASS "<plaintext>\x00filler\x00text\x00"
FAIL "<svg><![CDATA[\x00filler\x00text\x00]]>" FAIL "<svg><![CDATA[\x00filler\x00text\x00]]>"
FAIL "<body><!\x00>" FAIL "<body><!\x00>"
FAIL "<body><!\x00filler\x00text>" FAIL "<body><!\x00filler\x00text>"

View File

@ -152,6 +152,9 @@ type Tokenizer struct {
rawTag string rawTag string
// textIsRaw is whether the current text token's data is not escaped. // textIsRaw is whether the current text token's data is not escaped.
textIsRaw bool textIsRaw bool
// convertNUL is whether NUL bytes in the current token's data should
// be converted into \ufffd replacement characters.
convertNUL bool
} }
// Err returns the error associated with the most recent ErrorToken token. // Err returns the error associated with the most recent ErrorToken token.
@ -597,16 +600,19 @@ func (z *Tokenizer) Next() TokenType {
for z.err == nil { for z.err == nil {
z.readByte() z.readByte()
} }
z.data.end = z.raw.end
z.textIsRaw = true z.textIsRaw = true
} else { } else {
z.readRawOrRCDATA() z.readRawOrRCDATA()
} }
if z.data.end > z.data.start { if z.data.end > z.data.start {
z.tt = TextToken z.tt = TextToken
z.convertNUL = true
return z.tt return z.tt
} }
} }
z.textIsRaw = false z.textIsRaw = false
z.convertNUL = false
loop: loop:
for { for {
@ -731,6 +737,11 @@ func convertNewlines(s []byte) []byte {
return s return s
} }
var (
nul = []byte("\x00")
replacement = []byte("\ufffd")
)
// Text returns the unescaped text of a text, comment or doctype token. The // Text returns the unescaped text of a text, comment or doctype token. The
// contents of the returned slice may change on the next call to Next. // contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) Text() []byte { func (z *Tokenizer) Text() []byte {
@ -740,6 +751,9 @@ func (z *Tokenizer) Text() []byte {
z.data.start = z.raw.end z.data.start = z.raw.end
z.data.end = z.raw.end z.data.end = z.raw.end
s = convertNewlines(s) s = convertNewlines(s)
if z.convertNUL && bytes.Contains(s, nul) {
s = bytes.Replace(s, nul, replacement, -1)
}
if !z.textIsRaw { if !z.textIsRaw {
s = unescape(s, false) s = unescape(s, false)
} }