introduce non-strict mode in xml parser,

good enough to parse some html.

in reader, add "comment" tag to collect
comment text.

do not allocate during Unmarshal unless pointer is nil.

R=r
DELTA=441  (416 added, 1 deleted, 24 changed)
OCL=35586
CL=35594
This commit is contained in:
Russ Cox 2009-10-11 23:51:46 -07:00
parent fcdba72d2a
commit d0aac0ace1
2 changed files with 437 additions and 22 deletions

View File

@ -162,14 +162,20 @@ func (p *Parser) unmarshal(val reflect.Value, start *StartElement) os.Error {
}
if pv, ok := val.(*reflect.PtrValue); ok {
zv := reflect.MakeZero(pv.Type().(*reflect.PtrType).Elem());
pv.PointTo(zv);
val = zv;
if pv.Get() == 0 {
zv := reflect.MakeZero(pv.Type().(*reflect.PtrType).Elem());
pv.PointTo(zv);
val = zv;
} else {
val = pv.Elem();
}
}
var (
data []byte;
saveData reflect.Value;
comment []byte;
saveComment reflect.Value;
sv *reflect.StructValue;
styp *reflect.StructType;
)
@ -251,7 +257,7 @@ func (p *Parser) unmarshal(val reflect.Value, start *StartElement) os.Error {
}
// Assign attributes.
// Also, determine whether we need to save character data.
// Also, determine whether we need to save character data or comments.
for i, n := 0, typ.NumField(); i < n; i++ {
f := typ.Field(i);
switch f.Tag {
@ -271,6 +277,11 @@ func (p *Parser) unmarshal(val reflect.Value, start *StartElement) os.Error {
}
strv.Set(val);
case "comment":
if saveComment == nil {
saveComment = sv.FieldByIndex(f.Index);
}
case "chardata":
if saveData == nil {
saveData = sv.FieldByIndex(f.Index);
@ -326,17 +337,27 @@ Loop:
if saveData != nil {
data = bytes.Add(data, t);
}
case Comment:
if saveComment != nil {
comment = bytes.Add(comment, t);
}
}
}
// Save accumulated character data
if saveData != nil {
switch t := saveData.(type) {
case *reflect.StringValue:
t.Set(string(data));
case *reflect.SliceValue:
t.Set(reflect.NewValue(data).(*reflect.SliceValue));
}
// Save accumulated character data and comments
switch t := saveData.(type) {
case *reflect.StringValue:
t.Set(string(data));
case *reflect.SliceValue:
t.Set(reflect.NewValue(data).(*reflect.SliceValue));
}
switch t := saveComment.(type) {
case *reflect.StringValue:
t.Set(string(comment));
case *reflect.SliceValue:
t.Set(reflect.NewValue(comment).(*reflect.SliceValue));
}
return nil;

View File

@ -111,12 +111,49 @@ type readByter interface {
// A Parser represents an XML parser reading a particular input stream.
// The parser assumes that its input is encoded in UTF-8.
type Parser struct {
// Strict defaults to true, enforcing the requirements
// of the XML specification.
// If set to false, the parser allows input containing common
// mistakes:
// * If an element is missing an end tag, the parser invents
// end tags as necessary to keep the return values from Token
// properly balanced.
// * In attribute values and character data, unknown or malformed
// character entities (sequences beginning with &) are left alone.
//
// Setting:
//
// p.Strict = false;
// p.AutoClose = HTMLAutoClose;
// p.Entity = HTMLEntity
//
// creates a parser that can handle typical HTML.
Strict bool;
// When Strict == false, AutoClose indicates a set of elements to
// consider closed immediately after they are opened, regardless
// of whether an end element is present.
AutoClose []string;
// Entity can be used to map non-standard entity names to string replacements.
// The parser behaves as if these standard mappings are present in the map,
// regardless of the actual map content:
//
// "lt": "<",
// "gt": ">",
// "amp": "&",
// "pos": "'",
// "quot": `"`,
//
Entity map[string]string;
r readByter;
buf bytes.Buffer;
stk *stack;
free *stack;
needClose bool;
toClose Name;
nextToken Token;
nextByte int;
ns map[string]string;
err os.Error;
@ -130,6 +167,7 @@ func NewParser(r io.Reader) *Parser {
ns: make(map[string]string),
nextByte: -1,
line: 1,
Strict: true,
};
// Get efficient byte at a time reader.
@ -169,9 +207,19 @@ func NewParser(r io.Reader) *Parser {
// it uses the prefix as the Space rather than report an error.
//
func (p *Parser) Token() (t Token, err os.Error) {
if t, err = p.RawToken(); err != nil {
if p.nextToken != nil {
t = p.nextToken;
p.nextToken = nil;
} else if t, err = p.RawToken(); err != nil {
return;
}
if !p.Strict {
if t1, ok := p.autoClose(t); ok {
p.nextToken = t;
t = t1;
}
}
switch t1 := t.(type) {
case StartElement:
// In XML name spaces, the translations listed in the
@ -201,7 +249,7 @@ func (p *Parser) Token() (t Token, err os.Error) {
case EndElement:
p.translate(&t1.Name, true);
if !p.popElement(t1.Name) {
if !p.popElement(&t1) {
return nil, p.err;
}
t = t1;
@ -286,13 +334,20 @@ func (p *Parser) pushNs(local string, url string, ok bool) {
// After popping the element, apply any undo records from
// the stack to restore the name translations that existed
// before we saw this element.
func (p *Parser) popElement(name Name) bool {
func (p *Parser) popElement(t *EndElement) bool {
s := p.pop();
name := t.Name;
switch {
case s == nil || s.kind != stkStart:
p.err = SyntaxError("unexpected end element </" + name.Local + ">");
return false;
case s.name.Local != name.Local:
if !p.Strict {
p.needClose = true;
p.toClose = t.Name;
t.Name = s.name;
return true;
}
p.err = SyntaxError("element <" + s.name.Local + "> closed by </" + name.Local + ">");
return false;
case s.name.Space != name.Space:
@ -311,6 +366,27 @@ func (p *Parser) popElement(name Name) bool {
return true;
}
// If the top element on the stack is autoclosing and
// t is not the end tag, invent the end tag.
func (p *Parser) autoClose(t Token) (Token, bool) {
if p.stk == nil || p.stk.kind != stkStart {
return nil, false;
}
name := strings.ToLower(p.stk.name.Local);
for _, s := range p.AutoClose {
if strings.ToLower(s) == name {
// This one should be auto closed if t doesn't close it.
et, ok := t.(EndElement);
if !ok || et.Name.Local != name {
return EndElement{p.stk.name}, true;
}
break;
}
}
return nil, false;
}
// RawToken is like Token but does not verify that
// start and end elements match and does not translate
// name space prefixes to their corresponding URLs.
@ -645,21 +721,38 @@ Input:
// Parsers are required to recognize lt, gt, amp, apos, and quot
// even if they have not been declared. That's all we allow.
var i int;
CharLoop:
for i = 0; i < len(p.tmp); i++ {
p.tmp[i], p.err = p.r.ReadByte();
if p.err != nil {
return nil;
}
if p.tmp[i] == ';' {
c := p.tmp[i];
if c == ';' {
break;
}
if 'a' <= c && c <= 'z' ||
'A' <= c && c <= 'Z' ||
'0' <= c && c <= '9' ||
c == '_' || c == '#' {
continue;
}
p.ungetc(c);
break;
}
s := string(p.tmp[0:i]);
if i >= len(p.tmp) {
if !p.Strict {
b0, b1 = 0, 0;
p.buf.WriteByte('&');
p.buf.Write(p.tmp[0:i]);
continue Input;
}
p.err = SyntaxError("character entity expression &" + s + "... too long");
return nil;
}
rune := -1;
var haveText bool;
var text string;
if i >= 2 && s[0] == '#' {
var n uint64;
var err os.Error;
@ -669,19 +762,28 @@ Input:
n, err = strconv.Btoui64(s[1:len(s)], 10);
}
if err == nil && n <= unicode.MaxRune {
rune = int(n);
text = string(n);
haveText = true;
}
} else {
if r, ok := entity[s]; ok {
rune = r;
text = string(r);
haveText = true;
} else {
text, haveText = p.Entity[s];
}
}
if rune < 0 {
if !haveText {
if !p.Strict {
b0, b1 = 0, 0;
p.buf.WriteByte('&');
p.buf.Write(p.tmp[0:i]);
continue Input;
}
p.err = SyntaxError("invalid character entity &" + s + ";");
return nil;
}
i = utf8.EncodeRune(rune, &p.tmp);
p.buf.Write(p.tmp[0:i]);
p.buf.Write(strings.Bytes(text));
b0, b1 = 0, 0;
continue Input;
}
@ -764,6 +866,7 @@ func (p *Parser) name() (s string, ok bool) {
func isNameByte(c byte) bool {
return 'A' <= c && c <= 'Z' ||
'a' <= c && c <= 'z' ||
'0' <= c && c <= '9' ||
c == '_' || c == ':' || c == '.' || c == '-';
}
@ -1079,3 +1182,294 @@ var second = []unicode.Range{
unicode.Range{0x309D, 0x309E, 1},
unicode.Range{0x30FC, 0x30FE, 1},
}
// HTMLEntity is an entity map containing translations for the
// standard HTML entity characters.
var HTMLEntity = htmlEntity
var htmlEntity = map[string]string {
/*
hget http://www.w3.org/TR/html4/sgml/entities.html |
ssam '
,y /\&gt;/ x/\&lt;(.|\n)+/ s/\n/ /g
,x v/^\&lt;!ENTITY/d
,s/\&lt;!ENTITY ([^ ]+) .*U\+([0-9A-F][0-9A-F][0-9A-F][0-9A-F]) .+/ "\1": "\\u\2",/g
'
*/
"nbsp": "\u00A0",
"iexcl": "\u00A1",
"cent": "\u00A2",
"pound": "\u00A3",
"curren": "\u00A4",
"yen": "\u00A5",
"brvbar": "\u00A6",
"sect": "\u00A7",
"uml": "\u00A8",
"copy": "\u00A9",
"ordf": "\u00AA",
"laquo": "\u00AB",
"not": "\u00AC",
"shy": "\u00AD",
"reg": "\u00AE",
"macr": "\u00AF",
"deg": "\u00B0",
"plusmn": "\u00B1",
"sup2": "\u00B2",
"sup3": "\u00B3",
"acute": "\u00B4",
"micro": "\u00B5",
"para": "\u00B6",
"middot": "\u00B7",
"cedil": "\u00B8",
"sup1": "\u00B9",
"ordm": "\u00BA",
"raquo": "\u00BB",
"frac14": "\u00BC",
"frac12": "\u00BD",
"frac34": "\u00BE",
"iquest": "\u00BF",
"Agrave": "\u00C0",
"Aacute": "\u00C1",
"Acirc": "\u00C2",
"Atilde": "\u00C3",
"Auml": "\u00C4",
"Aring": "\u00C5",
"AElig": "\u00C6",
"Ccedil": "\u00C7",
"Egrave": "\u00C8",
"Eacute": "\u00C9",
"Ecirc": "\u00CA",
"Euml": "\u00CB",
"Igrave": "\u00CC",
"Iacute": "\u00CD",
"Icirc": "\u00CE",
"Iuml": "\u00CF",
"ETH": "\u00D0",
"Ntilde": "\u00D1",
"Ograve": "\u00D2",
"Oacute": "\u00D3",
"Ocirc": "\u00D4",
"Otilde": "\u00D5",
"Ouml": "\u00D6",
"times": "\u00D7",
"Oslash": "\u00D8",
"Ugrave": "\u00D9",
"Uacute": "\u00DA",
"Ucirc": "\u00DB",
"Uuml": "\u00DC",
"Yacute": "\u00DD",
"THORN": "\u00DE",
"szlig": "\u00DF",
"agrave": "\u00E0",
"aacute": "\u00E1",
"acirc": "\u00E2",
"atilde": "\u00E3",
"auml": "\u00E4",
"aring": "\u00E5",
"aelig": "\u00E6",
"ccedil": "\u00E7",
"egrave": "\u00E8",
"eacute": "\u00E9",
"ecirc": "\u00EA",
"euml": "\u00EB",
"igrave": "\u00EC",
"iacute": "\u00ED",
"icirc": "\u00EE",
"iuml": "\u00EF",
"eth": "\u00F0",
"ntilde": "\u00F1",
"ograve": "\u00F2",
"oacute": "\u00F3",
"ocirc": "\u00F4",
"otilde": "\u00F5",
"ouml": "\u00F6",
"divide": "\u00F7",
"oslash": "\u00F8",
"ugrave": "\u00F9",
"uacute": "\u00FA",
"ucirc": "\u00FB",
"uuml": "\u00FC",
"yacute": "\u00FD",
"thorn": "\u00FE",
"yuml": "\u00FF",
"fnof": "\u0192",
"Alpha": "\u0391",
"Beta": "\u0392",
"Gamma": "\u0393",
"Delta": "\u0394",
"Epsilon": "\u0395",
"Zeta": "\u0396",
"Eta": "\u0397",
"Theta": "\u0398",
"Iota": "\u0399",
"Kappa": "\u039A",
"Lambda": "\u039B",
"Mu": "\u039C",
"Nu": "\u039D",
"Xi": "\u039E",
"Omicron": "\u039F",
"Pi": "\u03A0",
"Rho": "\u03A1",
"Sigma": "\u03A3",
"Tau": "\u03A4",
"Upsilon": "\u03A5",
"Phi": "\u03A6",
"Chi": "\u03A7",
"Psi": "\u03A8",
"Omega": "\u03A9",
"alpha": "\u03B1",
"beta": "\u03B2",
"gamma": "\u03B3",
"delta": "\u03B4",
"epsilon": "\u03B5",
"zeta": "\u03B6",
"eta": "\u03B7",
"theta": "\u03B8",
"iota": "\u03B9",
"kappa": "\u03BA",
"lambda": "\u03BB",
"mu": "\u03BC",
"nu": "\u03BD",
"xi": "\u03BE",
"omicron": "\u03BF",
"pi": "\u03C0",
"rho": "\u03C1",
"sigmaf": "\u03C2",
"sigma": "\u03C3",
"tau": "\u03C4",
"upsilon": "\u03C5",
"phi": "\u03C6",
"chi": "\u03C7",
"psi": "\u03C8",
"omega": "\u03C9",
"thetasym": "\u03D1",
"upsih": "\u03D2",
"piv": "\u03D6",
"bull": "\u2022",
"hellip": "\u2026",
"prime": "\u2032",
"Prime": "\u2033",
"oline": "\u203E",
"frasl": "\u2044",
"weierp": "\u2118",
"image": "\u2111",
"real": "\u211C",
"trade": "\u2122",
"alefsym": "\u2135",
"larr": "\u2190",
"uarr": "\u2191",
"rarr": "\u2192",
"darr": "\u2193",
"harr": "\u2194",
"crarr": "\u21B5",
"lArr": "\u21D0",
"uArr": "\u21D1",
"rArr": "\u21D2",
"dArr": "\u21D3",
"hArr": "\u21D4",
"forall": "\u2200",
"part": "\u2202",
"exist": "\u2203",
"empty": "\u2205",
"nabla": "\u2207",
"isin": "\u2208",
"notin": "\u2209",
"ni": "\u220B",
"prod": "\u220F",
"sum": "\u2211",
"minus": "\u2212",
"lowast": "\u2217",
"radic": "\u221A",
"prop": "\u221D",
"infin": "\u221E",
"ang": "\u2220",
"and": "\u2227",
"or": "\u2228",
"cap": "\u2229",
"cup": "\u222A",
"int": "\u222B",
"there4": "\u2234",
"sim": "\u223C",
"cong": "\u2245",
"asymp": "\u2248",
"ne": "\u2260",
"equiv": "\u2261",
"le": "\u2264",
"ge": "\u2265",
"sub": "\u2282",
"sup": "\u2283",
"nsub": "\u2284",
"sube": "\u2286",
"supe": "\u2287",
"oplus": "\u2295",
"otimes": "\u2297",
"perp": "\u22A5",
"sdot": "\u22C5",
"lceil": "\u2308",
"rceil": "\u2309",
"lfloor": "\u230A",
"rfloor": "\u230B",
"lang": "\u2329",
"rang": "\u232A",
"loz": "\u25CA",
"spades": "\u2660",
"clubs": "\u2663",
"hearts": "\u2665",
"diams": "\u2666",
"quot": "\u0022",
"amp": "\u0026",
"lt": "\u003C",
"gt": "\u003E",
"OElig": "\u0152",
"oelig": "\u0153",
"Scaron": "\u0160",
"scaron": "\u0161",
"Yuml": "\u0178",
"circ": "\u02C6",
"tilde": "\u02DC",
"ensp": "\u2002",
"emsp": "\u2003",
"thinsp": "\u2009",
"zwnj": "\u200C",
"zwj": "\u200D",
"lrm": "\u200E",
"rlm": "\u200F",
"ndash": "\u2013",
"mdash": "\u2014",
"lsquo": "\u2018",
"rsquo": "\u2019",
"sbquo": "\u201A",
"ldquo": "\u201C",
"rdquo": "\u201D",
"bdquo": "\u201E",
"dagger": "\u2020",
"Dagger": "\u2021",
"permil": "\u2030",
"lsaquo": "\u2039",
"rsaquo": "\u203A",
"euro": "\u20AC",
}
// HTMLAutoClose is the set of HTML elements that
// should be considered to close automatically.
var HTMLAutoClose = htmlAutoClose
var htmlAutoClose = []string {
/*
hget http://www.w3.org/TR/html4/loose.dtd |
9 sed -n 's/<!ELEMENT (.*) - O EMPTY.+/ "\1",/p' | tr A-Z a-z
*/
"basefont",
"br",
"area",
"link",
"img",
"param",
"hr",
"input",
"col ",
"frame",
"isindex",
"base",
"meta",
}