From b4cae4aee24363580c14e69a3b6bbcdf9b108f92 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Thu, 30 Jun 2011 10:26:22 -0400 Subject: [PATCH] exp/regexp/syntax: finish Regexp manipulation Except for the inevitable bug fixes, the Regexp code is done. R=sam.thorogood, r CC=golang-dev https://golang.org/cl/4635082 --- src/pkg/exp/regexp/syntax/parse.go | 364 +++++++++++++++++++-- src/pkg/exp/regexp/syntax/parse_test.go | 10 +- src/pkg/exp/regexp/syntax/regexp.go | 76 ++++- src/pkg/exp/regexp/syntax/simplify.go | 151 +++++++++ src/pkg/exp/regexp/syntax/simplify_test.go | 151 +++++++++ 5 files changed, 713 insertions(+), 39 deletions(-) create mode 100644 src/pkg/exp/regexp/syntax/simplify.go create mode 100644 src/pkg/exp/regexp/syntax/simplify_test.go diff --git a/src/pkg/exp/regexp/syntax/parse.go b/src/pkg/exp/regexp/syntax/parse.go index ae40d5fc94..b6c91f7e1d 100644 --- a/src/pkg/exp/regexp/syntax/parse.go +++ b/src/pkg/exp/regexp/syntax/parse.go @@ -106,8 +106,6 @@ func (p *parser) reuse(re *Regexp) { // push pushes the regexp re onto the parse stack and returns the regexp. func (p *parser) push(re *Regexp) *Regexp { - // TODO: compute simple - if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] { // Single rune. if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) { @@ -250,7 +248,7 @@ func (p *parser) concat() *Regexp { return p.push(p.newRegexp(OpEmptyMatch)) } - return p.collapse(subs, OpConcat) + return p.push(p.collapse(subs, OpConcat)) } // alternate replaces the top of the stack (above the topmost '(') with its alternation. @@ -276,7 +274,7 @@ func (p *parser) alternate() *Regexp { return p.push(p.newRegexp(OpNoMatch)) } - return p.collapse(subs, OpAlternate) + return p.push(p.collapse(subs, OpAlternate)) } // cleanAlt cleans re for eventual inclusion in an alternation. @@ -302,13 +300,13 @@ func cleanAlt(re *Regexp) { } } -// collapse pushes the result of applying op to sub -// onto the stack. If sub contains op nodes, they all -// get flattened into a single node. -// sub points into p.stack so it cannot be kept. +// collapse returns the result of applying op to sub. +// If sub contains op nodes, they all get hoisted up +// so that there is never a concat of a concat or an +// alternate of an alternate. func (p *parser) collapse(subs []*Regexp, op Op) *Regexp { if len(subs) == 1 { - return p.push(subs[0]) + return subs[0] } re := p.newRegexp(op) re.Sub = re.Sub0[:0] @@ -320,7 +318,295 @@ func (p *parser) collapse(subs []*Regexp, op Op) *Regexp { re.Sub = append(re.Sub, sub) } } - return p.push(re) + if op == OpAlternate { + re.Sub = p.factor(re.Sub, re.Flags) + if len(re.Sub) == 1 { + old := re + re = re.Sub[0] + p.reuse(old) + } + } + return re +} + +// factor factors common prefixes from the alternation list sub. +// It returns a replacement list that reuses the same storage and +// frees (passes to p.reuse) any removed *Regexps. +// +// For example, +// ABC|ABD|AEF|BCX|BCY +// simplifies by literal prefix extraction to +// A(B(C|D)|EF)|BC(X|Y) +// which simplifies by character class introduction to +// A(B[CD]|EF)|BC[XY] +// +func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp { + if len(sub) < 2 { + return sub + } + + // Round 1: Factor out common literal prefixes. + var str []int + var strflags Flags + start := 0 + out := sub[:0] + for i := 0; i <= len(sub); i++ { + // Invariant: the Regexps that were in sub[0:start] have been + // used or marked for reuse, and the slice space has been reused + // for out (len(out) <= start). + // + // Invariant: sub[start:i] consists of regexps that all begin + // with str as modified by strflags. + var istr []int + var iflags Flags + if i < len(sub) { + istr, iflags = p.leadingString(sub[i]) + if iflags == strflags { + same := 0 + for same < len(str) && same < len(istr) && str[same] == istr[same] { + same++ + } + if same > 0 { + // Matches at least one rune in current range. + // Keep going around. + str = str[:same] + continue + } + } + } + + // Found end of a run with common leading literal string: + // sub[start:i] all begin with str[0:len(str)], but sub[i] + // does not even begin with str[0]. + // + // Factor out common string and append factored expression to out. + if i == start { + // Nothing to do - run of length 0. + } else if i == start+1 { + // Just one: don't bother factoring. + out = append(out, sub[start]) + } else { + // Construct factored form: prefix(suffix1|suffix2|...) + prefix := p.newRegexp(OpLiteral) + prefix.Flags = strflags + prefix.Rune = append(prefix.Rune[:0], str...) + + for j := start; j < i; j++ { + sub[j] = p.removeLeadingString(sub[j], len(str)) + } + suffix := p.collapse(sub[start:i], OpAlternate) // recurse + + re := p.newRegexp(OpConcat) + re.Sub = append(re.Sub[:0], prefix, suffix) + out = append(out, re) + } + + // Prepare for next iteration. + start = i + str = istr + strflags = iflags + } + sub = out + + // Round 2: Factor out common complex prefixes, + // just the first piece of each concatenation, + // whatever it is. This is good enough a lot of the time. + start = 0 + out = sub[:0] + var first *Regexp + for i := 0; i <= len(sub); i++ { + // Invariant: the Regexps that were in sub[0:start] have been + // used or marked for reuse, and the slice space has been reused + // for out (len(out) <= start). + // + // Invariant: sub[start:i] consists of regexps that all begin + // with str as modified by strflags. + var ifirst *Regexp + if i < len(sub) { + ifirst = p.leadingRegexp(sub[i]) + if first != nil && first.Equal(ifirst) { + continue + } + } + + // Found end of a run with common leading regexp: + // sub[start:i] all begin with first but sub[i] does not. + // + // Factor out common regexp and append factored expression to out. + if i == start { + // Nothing to do - run of length 0. + } else if i == start+1 { + // Just one: don't bother factoring. + out = append(out, sub[start]) + } else { + // Construct factored form: prefix(suffix1|suffix2|...) + prefix := first + + for j := start; j < i; j++ { + reuse := j != start // prefix came from sub[start] + sub[j] = p.removeLeadingRegexp(sub[j], reuse) + } + suffix := p.collapse(sub[start:i], OpAlternate) // recurse + + re := p.newRegexp(OpConcat) + re.Sub = append(re.Sub[:0], prefix, suffix) + out = append(out, re) + } + + // Prepare for next iteration. + start = i + first = ifirst + } + sub = out + + // Round 3: Collapse runs of single literals into character classes. + start = 0 + out = sub[:0] + for i := 0; i <= len(sub); i++ { + // Invariant: the Regexps that were in sub[0:start] have been + // used or marked for reuse, and the slice space has been reused + // for out (len(out) <= start). + // + // Invariant: sub[start:i] consists of regexps that are either + // literal runes or character classes. + if i < len(sub) && isCharClass(sub[i]) { + continue + } + + // sub[i] is not a char or char class; + // emit char class for sub[start:i]... + if i == start { + // Nothing to do - run of length 0. + } else if i == start+1 { + out = append(out, sub[start]) + } else { + // Make new char class. + // Start with most complex regexp in sub[start]. + max := start + for j := start + 1; j < i; j++ { + if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) { + max = j + } + } + sub[start], sub[max] = sub[max], sub[start] + + for j := start + 1; j < i; j++ { + mergeCharClass(sub[start], sub[j]) + p.reuse(sub[j]) + } + cleanAlt(sub[start]) + out = append(out, sub[start]) + } + + // ... and then emit sub[i]. + if i < len(sub) { + out = append(out, sub[i]) + } + start = i + 1 + } + sub = out + + // Round 4: Collapse runs of empty matches into a single empty match. + start = 0 + out = sub[:0] + for i := range sub { + if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch { + continue + } + out = append(out, sub[i]) + } + sub = out + + return sub +} + +// leadingString returns the leading literal string that re begins with. +// The string refers to storage in re or its children. +func (p *parser) leadingString(re *Regexp) ([]int, Flags) { + if re.Op == OpConcat && len(re.Sub) > 0 { + re = re.Sub[0] + } + if re.Op != OpLiteral { + return nil, 0 + } + return re.Rune, re.Flags & FoldCase +} + +// removeLeadingString removes the first n leading runes +// from the beginning of re. It returns the replacement for re. +func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp { + if re.Op == OpConcat && len(re.Sub) > 0 { + // Removing a leading string in a concatenation + // might simplify the concatenation. + sub := re.Sub[0] + sub = p.removeLeadingString(sub, n) + re.Sub[0] = sub + if sub.Op == OpEmptyMatch { + p.reuse(sub) + switch len(re.Sub) { + case 0, 1: + // Impossible but handle. + re.Op = OpEmptyMatch + re.Sub = nil + case 2: + old := re + re = re.Sub[1] + p.reuse(old) + default: + copy(re.Sub, re.Sub[1:]) + re.Sub = re.Sub[:len(re.Sub)-1] + } + } + return re + } + + if re.Op == OpLiteral { + re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])] + if len(re.Rune) == 0 { + re.Op = OpEmptyMatch + } + } + return re +} + +// leadingRegexp returns the leading regexp that re begins with. +// The regexp refers to storage in re or its children. +func (p *parser) leadingRegexp(re *Regexp) *Regexp { + if re.Op == OpEmptyMatch { + return nil + } + if re.Op == OpConcat && len(re.Sub) > 0 { + sub := re.Sub[0] + if sub.Op == OpEmptyMatch { + return nil + } + return sub + } + return re +} + +// removeLeadingRegexp removes the leading regexp in re. +// It returns the replacement for re. +// If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse. +func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp { + if re.Op == OpConcat && len(re.Sub) > 0 { + if reuse { + p.reuse(re.Sub[0]) + } + re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])] + switch len(re.Sub) { + case 0: + re.Op = OpEmptyMatch + re.Sub = nil + case 1: + old := re + re = re.Sub[0] + p.reuse(old) + } + return re + } + re.Op = OpEmptyMatch + return re } func literalRegexp(s string, flags Flags) *Regexp { @@ -752,6 +1038,36 @@ func (p *parser) parseVerticalBar() os.Error { return nil } +// mergeCharClass makes dst = dst|src. +// The caller must ensure that dst.Op >= src.Op, +// to reduce the amount of copying. +func mergeCharClass(dst, src *Regexp) { + switch dst.Op { + case OpAnyChar: + // src doesn't add anything. + case OpAnyCharNotNL: + // src might add \n + if matchRune(src, '\n') { + dst.Op = OpAnyChar + } + case OpCharClass: + // src is simpler, so either literal or char class + if src.Op == OpLiteral { + dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0]) + } else { + dst.Rune = appendClass(dst.Rune, src.Rune) + } + case OpLiteral: + // both literal + if src.Rune[0] == dst.Rune[0] { + break + } + dst.Op = OpCharClass + dst.Rune = append(dst.Rune, dst.Rune[0]) + dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0]) + } +} + // If the top of the stack is an element followed by an opVerticalBar // swapVerticalBar swaps the two and returns true. // Otherwise it returns false. @@ -767,30 +1083,7 @@ func (p *parser) swapVerticalBar() bool { re1, re3 = re3, re1 p.stack[n-3] = re3 } - switch re3.Op { - case OpAnyChar: - // re1 doesn't add anything. - case OpAnyCharNotNL: - // re1 might add \n - if matchRune(re1, '\n') { - re3.Op = OpAnyChar - } - case OpCharClass: - // re1 is simpler, so either literal or char class - if re1.Op == OpLiteral { - re3.Rune = appendRange(re3.Rune, re1.Rune[0], re1.Rune[0]) - } else { - re3.Rune = appendClass(re3.Rune, re1.Rune) - } - case OpLiteral: - // both literal - if re1.Rune[0] == re3.Rune[0] { - break - } - re3.Op = OpCharClass - re3.Rune = append(re3.Rune, re3.Rune[0]) - re3.Rune = appendRange(re3.Rune, re1.Rune[0], re1.Rune[0]) - } + mergeCharClass(re3, re1) p.reuse(re1) p.stack = p.stack[:n-1] return true @@ -1432,10 +1725,11 @@ func negateClass(r []int) []int { } nextLo = hi + 1 } + r = r[:w] if nextLo <= unicode.MaxRune { // It's possible for the negation to have one more // range - this one - than the original class, so use append. - r = append(r[:w], nextLo, unicode.MaxRune) + r = append(r, nextLo, unicode.MaxRune) } return r } diff --git a/src/pkg/exp/regexp/syntax/parse_test.go b/src/pkg/exp/regexp/syntax/parse_test.go index 51856b613e..779b9afdea 100644 --- a/src/pkg/exp/regexp/syntax/parse_test.go +++ b/src/pkg/exp/regexp/syntax/parse_test.go @@ -39,8 +39,7 @@ var parseTests = []struct { {`a{2,3}?`, `nrep{2,3 lit{a}}`}, {`a{2,}?`, `nrep{2,-1 lit{a}}`}, {``, `emp{}`}, - // { `|`, `emp{}` }, // alt{emp{}emp{}} but got factored - {`|`, `alt{emp{}emp{}}`}, + {`|`, `emp{}`}, // alt{emp{}emp{}} but got factored {`|x|`, `alt{emp{}lit{x}emp{}}`}, {`.`, `dot{}`}, {`^`, `bol{}`}, @@ -64,6 +63,9 @@ var parseTests = []struct { {`\-`, `lit{-}`}, {`-`, `lit{-}`}, {`\_`, `lit{_}`}, + {`abc`, `str{abc}`}, + {`abc|def`, `alt{str{abc}str{def}}`}, + {`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`}, // Posix and Perl extensions {`[[:lower:]]`, `cc{0x61-0x7a}`}, @@ -156,6 +158,10 @@ var parseTests = []struct { // Strings {`abcde`, `str{abcde}`}, {`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`}, + + // Factoring. + {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`}, + {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}cc{0x79-0x7a}}cat{plus{lit{y}}lit{w}}}}`}, } const testFlags = MatchNL | PerlX | UnicodeGroups diff --git a/src/pkg/exp/regexp/syntax/regexp.go b/src/pkg/exp/regexp/syntax/regexp.go index 248ace503c..00a4addefc 100644 --- a/src/pkg/exp/regexp/syntax/regexp.go +++ b/src/pkg/exp/regexp/syntax/regexp.go @@ -60,6 +60,59 @@ const ( const opPseudo Op = 128 // where pseudo-ops start +// Equal returns true if x and y have identical structure. +func (x *Regexp) Equal(y *Regexp) bool { + if x == nil || y == nil { + return x == y + } + if x.Op != y.Op { + return false + } + switch x.Op { + case OpEndText: + // The parse flags remember whether this is \z or \Z. + if x.Flags&WasDollar != y.Flags&WasDollar { + return false + } + + case OpLiteral, OpCharClass: + if len(x.Rune) != len(y.Rune) { + return false + } + for i, r := range x.Rune { + if r != y.Rune[i] { + return false + } + } + + case OpAlternate, OpConcat: + if len(x.Sub) != len(y.Sub) { + return false + } + for i, sub := range x.Sub { + if !sub.Equal(y.Sub[i]) { + return false + } + } + + case OpStar, OpPlus, OpQuest: + if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) { + return false + } + + case OpRepeat: + if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) { + return false + } + + case OpCapture: + if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) { + return false + } + } + return true +} + // writeRegexp writes the Perl syntax for the regular expression re to b. func writeRegexp(b *bytes.Buffer, re *Regexp) { switch re.Op { @@ -70,16 +123,24 @@ func writeRegexp(b *bytes.Buffer, re *Regexp) { case OpEmptyMatch: b.WriteString(`(?:)`) case OpLiteral: + if re.Flags&FoldCase != 0 { + b.WriteString(`(?i:`) + } for _, r := range re.Rune { escape(b, r, false) } + if re.Flags&FoldCase != 0 { + b.WriteString(`)`) + } case OpCharClass: if len(re.Rune)%2 != 0 { b.WriteString(`[invalid char class]`) break } b.WriteRune('[') - if len(re.Rune) > 0 && re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune { + if len(re.Rune) == 0 { + b.WriteString(`^\x00-\x{10FFFF}`) + } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune { // Contains 0 and MaxRune. Probably a negated class. // Print the gaps. b.WriteRune('^') @@ -126,7 +187,9 @@ func writeRegexp(b *bytes.Buffer, re *Regexp) { } else { b.WriteRune('(') } - writeRegexp(b, re.Sub[0]) + if re.Sub[0].Op != OpEmptyMatch { + writeRegexp(b, re.Sub[0]) + } b.WriteRune(')') case OpStar, OpPlus, OpQuest, OpRepeat: if sub := re.Sub[0]; sub.Op > OpCapture { @@ -205,6 +268,15 @@ func escape(b *bytes.Buffer, r int, force bool) { case '\v': b.WriteString(`\v`) default: + if r < 0x100 { + b.WriteString(`\x`) + s := strconv.Itob(r, 16) + if len(s) == 1 { + b.WriteRune('0') + } + b.WriteString(s) + break + } b.WriteString(`\x{`) b.WriteString(strconv.Itob(r, 16)) b.WriteString(`}`) diff --git a/src/pkg/exp/regexp/syntax/simplify.go b/src/pkg/exp/regexp/syntax/simplify.go new file mode 100644 index 0000000000..72390417bb --- /dev/null +++ b/src/pkg/exp/regexp/syntax/simplify.go @@ -0,0 +1,151 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +// Simplify returns a regexp equivalent to re but without counted repetitions +// and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/. +// The resulting regexp will execute correctly but its string representation +// will not produce the same parse tree, because capturing parentheses +// may have been duplicated or removed. For example, the simplified form +// for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1. +// The returned regexp may share structure with or be the original. +func (re *Regexp) Simplify() *Regexp { + if re == nil { + return nil + } + switch re.Op { + case OpCapture, OpConcat, OpAlternate: + // Simplify children, building new Regexp if children change. + nre := re + for i, sub := range re.Sub { + nsub := sub.Simplify() + if nre == re && nsub != sub { + // Start a copy. + nre = new(Regexp) + *nre = *re + nre.Rune = nil + nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...) + } + if nre != re { + nre.Sub = append(nre.Sub, nsub) + } + } + return nre + + case OpStar, OpPlus, OpQuest: + sub := re.Sub[0].Simplify() + return simplify1(re.Op, re.Flags, sub, re) + + case OpRepeat: + // Special special case: x{0} matches the empty string + // and doesn't even need to consider x. + if re.Min == 0 && re.Max == 0 { + return &Regexp{Op: OpEmptyMatch} + } + + // The fun begins. + sub := re.Sub[0].Simplify() + + // x{n,} means at least n matches of x. + if re.Max == -1 { + // Special case: x{0,} is x*. + if re.Min == 0 { + return simplify1(OpStar, re.Flags, sub, nil) + } + + // Special case: x{1,} is x+. + if re.Min == 1 { + return simplify1(OpPlus, re.Flags, sub, nil) + } + + // General case: x{4,} is xxxx+. + nre := &Regexp{Op: OpConcat} + nre.Sub = nre.Sub0[:0] + for i := 0; i < re.Min-1; i++ { + nre.Sub = append(nre.Sub, sub) + } + nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil)) + return nre + } + + // Special case x{0} handled above. + + // Special case: x{1} is just x. + if re.Min == 1 && re.Max == 1 { + return sub + } + + // General case: x{n,m} means n copies of x and m copies of x? + // The machine will do less work if we nest the final m copies, + // so that x{2,5} = xx(x(x(x)?)?)? + + // Build leading prefix: xx. + var prefix *Regexp + if re.Min > 0 { + prefix = &Regexp{Op: OpConcat} + prefix.Sub = prefix.Sub0[:0] + for i := 0; i < re.Min; i++ { + prefix.Sub = append(prefix.Sub, sub) + } + } + + // Build and attach suffix: (x(x(x)?)?)? + if re.Max > re.Min { + suffix := simplify1(OpQuest, re.Flags, sub, nil) + for i := re.Min + 1; i < re.Max; i++ { + nre2 := &Regexp{Op: OpConcat} + nre2.Sub = append(nre2.Sub0[:0], sub, suffix) + suffix = simplify1(OpQuest, re.Flags, nre2, nil) + } + if prefix == nil { + return suffix + } + prefix.Sub = append(prefix.Sub, suffix) + } + if prefix != nil { + return prefix + } + + // Some degenerate case like min > max or min < max < 0. + // Handle as impossible match. + return &Regexp{Op: OpNoMatch} + } + + return re +} + +// simplify1 implements Simplify for the unary OpStar, +// OpPlus, and OpQuest operators. It returns the simple regexp +// equivalent to +// +// Regexp{Op: op, Flags: flags, Sub: {sub}} +// +// under the assumption that sub is already simple, and +// without first allocating that structure. If the regexp +// to be returned turns out to be equivalent to re, simplify1 +// returns re instead. +// +// simplify1 is factored out of Simplify because the implementation +// for other operators generates these unary expressions. +// Letting them call simplify1 makes sure the expressions they +// generate are simple. +func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp { + // Special case: repeat the empty string as much as + // you want, but it's still the empty string. + if sub.Op == OpEmptyMatch { + return sub + } + // The operators are idempotent if the flags match. + if op == sub.Op && flags&NonGreedy == sub.Flags&NonGreedy { + return sub + } + if re != nil && re.Op == op && re.Flags&NonGreedy == flags&NonGreedy && sub == re.Sub[0] { + return re + } + + re = &Regexp{Op: op, Flags: flags} + re.Sub = append(re.Sub0[:0], sub) + return re +} diff --git a/src/pkg/exp/regexp/syntax/simplify_test.go b/src/pkg/exp/regexp/syntax/simplify_test.go new file mode 100644 index 0000000000..c8cec21831 --- /dev/null +++ b/src/pkg/exp/regexp/syntax/simplify_test.go @@ -0,0 +1,151 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +import "testing" + +var simplifyTests = []struct { + Regexp string + Simple string +}{ + // Already-simple constructs + {`a`, `a`}, + {`ab`, `ab`}, + {`a|b`, `[a-b]`}, + {`ab|cd`, `ab|cd`}, + {`(ab)*`, `(ab)*`}, + {`(ab)+`, `(ab)+`}, + {`(ab)?`, `(ab)?`}, + {`.`, `.`}, + {`^`, `^`}, + {`$`, `$`}, + {`[ac]`, `[ac]`}, + {`[^ac]`, `[^ac]`}, + + // Posix character classes + {`[[:alnum:]]`, `[0-9A-Za-z]`}, + {`[[:alpha:]]`, `[A-Za-z]`}, + {`[[:blank:]]`, `[\t ]`}, + {`[[:cntrl:]]`, `[\x00-\x1f\x7f]`}, + {`[[:digit:]]`, `[0-9]`}, + {`[[:graph:]]`, `[!-~]`}, + {`[[:lower:]]`, `[a-z]`}, + {`[[:print:]]`, `[ -~]`}, + {`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"}, + {`[[:space:]]`, `[\t-\r ]`}, + {`[[:upper:]]`, `[A-Z]`}, + {`[[:xdigit:]]`, `[0-9A-Fa-f]`}, + + // Perl character classes + {`\d`, `[0-9]`}, + {`\s`, `[\t-\n\f-\r ]`}, + {`\w`, `[0-9A-Z_a-z]`}, + {`\D`, `[^0-9]`}, + {`\S`, `[^\t-\n\f-\r ]`}, + {`\W`, `[^0-9A-Z_a-z]`}, + {`[\d]`, `[0-9]`}, + {`[\s]`, `[\t-\n\f-\r ]`}, + {`[\w]`, `[0-9A-Z_a-z]`}, + {`[\D]`, `[^0-9]`}, + {`[\S]`, `[^\t-\n\f-\r ]`}, + {`[\W]`, `[^0-9A-Z_a-z]`}, + + // Posix repetitions + {`a{1}`, `a`}, + {`a{2}`, `aa`}, + {`a{5}`, `aaaaa`}, + {`a{0,1}`, `a?`}, + // The next three are illegible because Simplify inserts (?:) + // parens instead of () parens to avoid creating extra + // captured subexpressions. The comments show a version with fewer parens. + {`(a){0,2}`, `(?:(a)(a)?)?`}, // (aa?)? + {`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // (a(a(aa?)?)?)? + {`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)? + {`a{0,2}`, `(?:aa?)?`}, // (aa?)? + {`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`}, // (a(a(aa?)?)?)? + {`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`}, // aa(a(a(aa?)?)?)? + {`a{0,}`, `a*`}, + {`a{1,}`, `a+`}, + {`a{2,}`, `aa+`}, + {`a{5,}`, `aaaaa+`}, + + // Test that operators simplify their arguments. + {`(?:a{1,}){1,}`, `a+`}, + {`(a{1,}b{1,})`, `(a+b+)`}, + {`a{1,}|b{1,}`, `a+|b+`}, + {`(?:a{1,})*`, `(?:a+)*`}, + {`(?:a{1,})+`, `a+`}, + {`(?:a{1,})?`, `(?:a+)?`}, + {``, `(?:)`}, + {`a{0}`, `(?:)`}, + + // Character class simplification + {`[ab]`, `[a-b]`}, + {`[a-za-za-z]`, `[a-z]`}, + {`[A-Za-zA-Za-z]`, `[A-Za-z]`}, + {`[ABCDEFGH]`, `[A-H]`}, + {`[AB-CD-EF-GH]`, `[A-H]`}, + {`[W-ZP-XE-R]`, `[E-Z]`}, + {`[a-ee-gg-m]`, `[a-m]`}, + {`[a-ea-ha-m]`, `[a-m]`}, + {`[a-ma-ha-e]`, `[a-m]`}, + {`[a-zA-Z0-9 -~]`, `[ -~]`}, + + // Empty character classes + {`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`}, + + // Full character classes + {`[[:cntrl:][:^cntrl:]]`, `.`}, + + // Unicode case folding. + {`(?i)A`, `(?i:A)`}, + {`(?i)a`, `(?i:a)`}, + {`(?i)[A]`, `(?i:A)`}, + {`(?i)[a]`, `(?i:A)`}, + {`(?i)K`, `(?i:K)`}, + {`(?i)k`, `(?i:k)`}, + {`(?i)\x{212a}`, "(?i:\u212A)"}, + {`(?i)[K]`, "[Kk\u212A]"}, + {`(?i)[k]`, "[Kk\u212A]"}, + {`(?i)[\x{212a}]`, "[Kk\u212A]"}, + {`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"}, + {`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"}, + {`(?i)[\x00-\x{10FFFF}]`, `.`}, + + // Empty string as a regular expression. + // The empty string must be preserved inside parens in order + // to make submatches work right, so these tests are less + // interesting than they might otherwise be. String inserts + // explicit (?:) in place of non-parenthesized empty strings, + // to make them easier to spot for other parsers. + {`(a|b|)`, `([a-b]|(?:))`}, + {`(|)`, `()`}, + {`a()`, `a()`}, + {`(()|())`, `(()|())`}, + {`(a|)`, `(a|(?:))`}, + {`ab()cd()`, `ab()cd()`}, + {`()`, `()`}, + {`()*`, `()*`}, + {`()+`, `()+`}, + {`()?`, `()?`}, + {`(){0}`, `(?:)`}, + {`(){1}`, `()`}, + {`(){1,}`, `()+`}, + {`(){0,2}`, `(?:()()?)?`}, +} + +func TestSimplify(t *testing.T) { + for _, tt := range simplifyTests { + re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine) + if err != nil { + t.Errorf("Parse(%#q) = error %v", tt.Regexp, err) + continue + } + s := re.Simplify().String() + if s != tt.Simple { + t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple) + } + } +}