regexp: add support for matching text read from things that implement ReadRune.

(If you have a Reader but not a RuneReader, wrap it with bufio.)

The matching code is a few percent slower but significantly cleaner.

R=rsc
CC=golang-dev
https://golang.org/cl/4125046
This commit is contained in:
Rob Pike 2011-02-03 13:58:40 -08:00
parent 63457d089e
commit 7db904c1f6
2 changed files with 236 additions and 73 deletions

View File

@ -6,6 +6,7 @@ package regexp
import ( import (
"fmt" "fmt"
"strings"
"testing" "testing"
) )
@ -191,6 +192,12 @@ func TestFindStringIndex(t *testing.T) {
} }
} }
// TestFindReaderIndex runs every entry of findTests through FindReaderIndex,
// supplying the text via a strings.Reader, and hands the resulting index
// pair to testFindIndex for checking.
func TestFindReaderIndex(t *testing.T) {
	for _, test := range findTests {
		re := MustCompile(test.pat)
		loc := re.FindReaderIndex(strings.NewReader(test.text))
		testFindIndex(&test, loc, t)
	}
}
// Now come the simple All cases. // Now come the simple All cases.
func TestFindAll(t *testing.T) { func TestFindAll(t *testing.T) {
@ -387,6 +394,12 @@ func TestFindStringSubmatchIndex(t *testing.T) {
} }
} }
// TestFindReaderSubmatchIndex runs every entry of findTests through
// FindReaderSubmatchIndex, supplying the text via a strings.Reader, and hands
// the resulting index slice to testFindSubmatchIndex for checking.
func TestFindReaderSubmatchIndex(t *testing.T) {
	for _, test := range findTests {
		re := MustCompile(test.pat)
		locs := re.FindReaderSubmatchIndex(strings.NewReader(test.text))
		testFindSubmatchIndex(&test, locs, t)
	}
}
// Now come the monster AllSubmatch cases. // Now come the monster AllSubmatch cases.
func TestFindAllSubmatch(t *testing.T) { func TestFindAllSubmatch(t *testing.T) {

View File

@ -54,6 +54,16 @@
// text of the match/submatch. If an index is negative, it means that // text of the match/submatch. If an index is negative, it means that
// subexpression did not match any string in the input. // subexpression did not match any string in the input.
// //
// There is also a subset of the methods that can be applied to text read
// from a RuneReader:
//
// MatchReader, FindReaderIndex, FindReaderSubmatchIndex
//
// This set may grow. Note that regular expression matches may need to
// examine text beyond the text returned by a match, so the methods that
// match text from a RuneReader may read arbitrarily far into the input
// before returning.
//
// (There are a few other methods that do not match this pattern.) // (There are a few other methods that do not match this pattern.)
// //
package regexp package regexp
@ -231,13 +241,13 @@ func (p *parser) error(err Error) {
panic(err) panic(err)
} }
const endOfFile = -1 const endOfText = -1
func (p *parser) c() int { return p.ch } func (p *parser) c() int { return p.ch }
func (p *parser) nextc() int { func (p *parser) nextc() int {
if p.pos >= len(p.re.expr) { if p.pos >= len(p.re.expr) {
p.ch = endOfFile p.ch = endOfText
} else { } else {
c, w := utf8.DecodeRuneInString(p.re.expr[p.pos:]) c, w := utf8.DecodeRuneInString(p.re.expr[p.pos:])
p.ch = c p.ch = c
@ -288,7 +298,7 @@ func (p *parser) checkBackslash() int {
if c == '\\' { if c == '\\' {
c = p.nextc() c = p.nextc()
switch { switch {
case c == endOfFile: case c == endOfText:
p.error(ErrExtraneousBackslash) p.error(ErrExtraneousBackslash)
case ispunct(c): case ispunct(c):
// c is as delivered // c is as delivered
@ -311,7 +321,7 @@ func (p *parser) charClass() *instr {
left := -1 left := -1
for { for {
switch c := p.c(); c { switch c := p.c(); c {
case ']', endOfFile: case ']', endOfText:
if left >= 0 { if left >= 0 {
p.error(ErrBadRange) p.error(ErrBadRange)
} }
@ -356,7 +366,7 @@ func (p *parser) charClass() *instr {
func (p *parser) term() (start, end *instr) { func (p *parser) term() (start, end *instr) {
switch c := p.c(); c { switch c := p.c(); c {
case '|', endOfFile: case '|', endOfText:
return nil, nil return nil, nil
case '*', '+', '?': case '*', '+', '?':
p.error(ErrBareClosure) p.error(ErrBareClosure)
@ -640,6 +650,9 @@ func (re *Regexp) NumSubexp() int { return re.nbra }
type matchArena struct { type matchArena struct {
head *matchVec head *matchVec
len int // length of match vector len int // length of match vector
pos int
atBOT bool // whether we're at beginning of text
atEOT bool // whether we're at end of text
} }
type matchVec struct { type matchVec struct {
@ -699,21 +712,21 @@ type state struct {
// Append new state to to-do list. Leftmost-longest wins so avoid // Append new state to to-do list. Leftmost-longest wins so avoid
// adding a state that's already active. The matchVec will be inc-ref'ed // adding a state that's already active. The matchVec will be inc-ref'ed
// if it is assigned to a state. // if it is assigned to a state.
func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matchVec, pos, end int) []state { func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matchVec) []state {
switch inst.kind { switch inst.kind {
case iBOT: case iBOT:
if pos == 0 { if a.atBOT {
s = a.addState(s, inst.next, prefixed, match, pos, end) s = a.addState(s, inst.next, prefixed, match)
} }
return s return s
case iEOT: case iEOT:
if pos == end { if a.atEOT {
s = a.addState(s, inst.next, prefixed, match, pos, end) s = a.addState(s, inst.next, prefixed, match)
} }
return s return s
case iBra: case iBra:
match.m[inst.braNum] = pos match.m[inst.braNum] = a.pos
s = a.addState(s, inst.next, prefixed, match, pos, end) s = a.addState(s, inst.next, prefixed, match)
return s return s
} }
l := len(s) l := len(s)
@ -727,62 +740,157 @@ func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matc
s = append(s, state{inst, prefixed, match}) s = append(s, state{inst, prefixed, match})
match.ref++ match.ref++
if inst.kind == iAlt { if inst.kind == iAlt {
s = a.addState(s, inst.left, prefixed, a.copy(match), pos, end) s = a.addState(s, inst.left, prefixed, a.copy(match))
// give other branch a copy of this match vector // give other branch a copy of this match vector
s = a.addState(s, inst.next, prefixed, a.copy(match), pos, end) s = a.addState(s, inst.next, prefixed, a.copy(match))
} }
return s return s
} }
// Accepts either string or bytes - the logic is identical either way. // input abstracts different representations of the input text. It provides
// If bytes == nil, scan str. // one-character lookahead.
func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { type input interface {
step(pos int) (rune int, width int) // advance one rune
canCheckPrefix() bool // can we look ahead without losing info?
hasPrefix(re *Regexp) bool
index(re *Regexp, pos int) int
}
// inputString is the matcher input backed by an in-memory string.
type inputString struct {
	str string // the text being scanned
}

// newInputString wraps str in a freshly allocated inputString.
func newInputString(str string) *inputString {
	in := new(inputString)
	in.str = str
	return in
}
// step decodes the rune that starts at byte offset pos of the string and
// returns it together with its encoded width. When pos is at or beyond the
// end of the string it returns endOfText with a width of 0.
func (i *inputString) step(pos int) (int, int) {
	if pos >= len(i.str) {
		return endOfText, 0
	}
	return utf8.DecodeRuneInString(i.str[pos:])
}
// canCheckPrefix always reports true for string input: hasPrefix and index
// inspect the stored string directly, so nothing is consumed by looking.
func (i *inputString) canCheckPrefix() bool { return true }
// hasPrefix reports whether the whole input string begins with re.prefix.
func (i *inputString) hasPrefix(re *Regexp) bool {
	want := re.prefix
	return strings.HasPrefix(i.str, want)
}
// index searches for re.prefix in the part of the string that starts at byte
// offset pos. The result is the offset of the first occurrence relative to
// pos (not to the start of the string), or -1 if the prefix does not occur.
func (i *inputString) index(re *Regexp, pos int) int {
	rest := i.str[pos:]
	return strings.Index(rest, re.prefix)
}
// inputBytes is the matcher input backed by an in-memory byte slice.
type inputBytes struct {
	str []byte // the text being scanned; the caller's slice, not a copy
}

// newInputBytes wraps str in a freshly allocated inputBytes. The slice is
// stored as given, without copying.
func newInputBytes(str []byte) *inputBytes {
	in := new(inputBytes)
	in.str = str
	return in
}
// step decodes the rune that starts at byte offset pos of the slice and
// returns it together with its encoded width. When pos is at or beyond the
// end of the slice it returns endOfText with a width of 0.
func (i *inputBytes) step(pos int) (int, int) {
	if pos >= len(i.str) {
		return endOfText, 0
	}
	return utf8.DecodeRune(i.str[pos:])
}
// canCheckPrefix always reports true for byte-slice input: hasPrefix and
// index inspect the stored slice directly, so nothing is consumed by looking.
func (i *inputBytes) canCheckPrefix() bool { return true }
// hasPrefix reports whether the whole input slice begins with re.prefixBytes.
func (i *inputBytes) hasPrefix(re *Regexp) bool {
	want := re.prefixBytes
	return bytes.HasPrefix(i.str, want)
}
// index searches for re.prefixBytes in the part of the slice that starts at
// byte offset pos. The result is the offset of the first occurrence relative
// to pos (not to the start of the slice), or -1 if the prefix does not occur.
func (i *inputBytes) index(re *Regexp, pos int) int {
	rest := i.str[pos:]
	return bytes.Index(rest, re.prefixBytes)
}
// inputReader is the matcher input backed by an io.RuneReader.
type inputReader struct {
	r     io.RuneReader // source of runes
	atEOT bool          // set once ReadRune has returned an error
	pos   int           // sum of the widths of the runes read so far
}

// newInputReader wraps r in a freshly allocated inputReader positioned at
// offset 0 and not yet at end of text.
func newInputReader(r io.RuneReader) *inputReader {
	in := new(inputReader)
	in.r = r
	return in
}
// step reads the next rune from the underlying RuneReader and returns it
// with its width, advancing the recorded position by that width.
//
// The reader can only be consumed sequentially, so while no read error has
// been seen a request for any pos other than the current position yields
// endOfText with width 0 without touching the reader. Any error from
// ReadRune marks the input as at end of text and is likewise reported as
// endOfText with width 0.
func (i *inputReader) step(pos int) (int, int) {
	if pos != i.pos && !i.atEOT {
		return endOfText, 0
	}
	c, width, err := i.r.ReadRune()
	if err == nil {
		i.pos += width
		return c, width
	}
	i.atEOT = true
	return endOfText, 0
}
// canCheckPrefix always reports false for reader input, so doExecute skips
// its literal-prefix fast path for this input type.
func (i *inputReader) canCheckPrefix() bool { return false }
// hasPrefix always reports false for reader input; it exists to satisfy the
// input interface and does not read from the RuneReader.
func (i *inputReader) hasPrefix(re *Regexp) bool { return false }
// index always returns -1 for reader input; it exists to satisfy the input
// interface and does not read from the RuneReader.
func (i *inputReader) index(re *Regexp, pos int) int { return -1 }
// Search match starting from pos bytes into the input.
func (re *Regexp) doExecute(i input, pos int) []int {
var s [2][]state var s [2][]state
s[0] = make([]state, 0, 10) s[0] = make([]state, 0, 10)
s[1] = make([]state, 0, 10) s[1] = make([]state, 0, 10)
in, out := 0, 1 in, out := 0, 1
var final state var final state
found := false found := false
end := len(str)
if bytestr != nil {
end = len(bytestr)
}
anchored := re.inst[0].next.kind == iBOT anchored := re.inst[0].next.kind == iBOT
if anchored && pos > 0 { if anchored && pos > 0 {
return nil return nil
} }
// fast check for initial plain substring // fast check for initial plain substring
if re.prefix != "" { if i.canCheckPrefix() && re.prefix != "" {
advance := 0 advance := 0
if anchored { if anchored {
if bytestr == nil { if !i.hasPrefix(re) {
if !strings.HasPrefix(str, re.prefix) {
return nil return nil
} }
} else { } else {
if !bytes.HasPrefix(bytestr, re.prefixBytes) { advance = i.index(re, pos)
return nil
}
}
} else {
if bytestr == nil {
advance = strings.Index(str[pos:], re.prefix)
} else {
advance = bytes.Index(bytestr[pos:], re.prefixBytes)
}
}
if advance == -1 { if advance == -1 {
return nil return nil
} }
}
pos += advance pos += advance
} }
arena := &matchArena{nil, 2 * (re.nbra + 1)} // We look one character ahead so we can match $, which checks whether
for startPos := pos; pos <= end; { // we are at EOT.
nextChar, nextWidth := i.step(pos)
arena := &matchArena{
len: 2 * (re.nbra + 1),
pos: pos,
atBOT: pos == 0,
atEOT: nextChar == endOfText,
}
for c, startPos := 0, pos; c != endOfText; {
if !found && (pos == startPos || !anchored) { if !found && (pos == startPos || !anchored) {
// prime the pump if we haven't seen a match yet // prime the pump if we haven't seen a match yet
match := arena.noMatch() match := arena.noMatch()
match.m[0] = pos match.m[0] = pos
s[out] = arena.addState(s[out], re.start.next, false, match, pos, end) s[out] = arena.addState(s[out], re.start.next, false, match)
arena.free(match) // if addState saved it, ref was incremented arena.free(match) // if addState saved it, ref was incremented
} else if len(s[out]) == 0 { } else if len(s[out]) == 0 {
// machine has completed // machine has completed
@ -795,35 +903,32 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
arena.free(state.match) arena.free(state.match)
} }
s[out] = old[0:0] // truncate state vector s[out] = old[0:0] // truncate state vector
charwidth := 1 c = nextChar
c := endOfFile thisPos := pos
if pos < end { pos += nextWidth
if bytestr == nil { nextChar, nextWidth = i.step(pos)
c, charwidth = utf8.DecodeRuneInString(str[pos:end]) arena.atEOT = nextChar == endOfText
} else { arena.atBOT = false
c, charwidth = utf8.DecodeRune(bytestr[pos:end]) arena.pos = pos
}
}
pos += charwidth
for _, st := range s[in] { for _, st := range s[in] {
switch st.inst.kind { switch st.inst.kind {
case iBOT: case iBOT:
case iEOT: case iEOT:
case iChar: case iChar:
if c == st.inst.char { if c == st.inst.char {
s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end) s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
} }
case iCharClass: case iCharClass:
if st.inst.cclass.matches(c) { if st.inst.cclass.matches(c) {
s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end) s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
} }
case iAny: case iAny:
if c != endOfFile { if c != endOfText {
s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end) s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
} }
case iNotNL: case iNotNL:
if c != endOfFile && c != '\n' { if c != endOfText && c != '\n' {
s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end) s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
} }
case iBra: case iBra:
case iAlt: case iAlt:
@ -831,13 +936,13 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
// choose leftmost longest // choose leftmost longest
if !found || // first if !found || // first
st.match.m[0] < final.match.m[0] || // leftmost st.match.m[0] < final.match.m[0] || // leftmost
(st.match.m[0] == final.match.m[0] && pos-charwidth > final.match.m[1]) { // longest (st.match.m[0] == final.match.m[0] && thisPos > final.match.m[1]) { // longest
if final.match != nil { if final.match != nil {
arena.free(final.match) arena.free(final.match)
} }
final = st final = st
final.match.ref++ final.match.ref++
final.match.m[1] = pos - charwidth final.match.m[1] = thisPos
} }
found = true found = true
default: default:
@ -874,14 +979,31 @@ func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
return string(c[:i]), true return string(c[:i]), true
} }
// MatchReader reports whether the Regexp matches the text read by the
// RuneReader: true for a match, false for no match. Matching starts at the
// reader's current position, which is treated as offset 0.
func (re *Regexp) MatchReader(r io.RuneReader) bool {
	loc := re.doExecute(newInputReader(r), 0)
	return len(loc) != 0
}
// MatchString returns whether the Regexp matches the string s. // MatchString returns whether the Regexp matches the string s.
// The return value is a boolean: true for match, false for no match. // The return value is a boolean: true for match, false for no match.
func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(s, nil, 0)) > 0 } func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(newInputString(s), 0)) > 0 }
// Match returns whether the Regexp matches the byte slice b. // Match returns whether the Regexp matches the byte slice b.
// The return value is a boolean: true for match, false for no match. // The return value is a boolean: true for match, false for no match.
func (re *Regexp) Match(b []byte) bool { return len(re.doExecute("", b, 0)) > 0 } func (re *Regexp) Match(b []byte) bool { return len(re.doExecute(newInputBytes(b), 0)) > 0 }
// MatchReader compiles pattern and reports whether it matches the text read
// by the RuneReader. If the pattern fails to compile, matched is false and
// the compilation error is returned. More complicated queries need to use
// Compile and the full Regexp interface.
func MatchReader(pattern string, r io.RuneReader) (matched bool, error os.Error) {
	re, err := Compile(pattern)
	if err == nil {
		return re.MatchReader(r), nil
	}
	return false, err
}
// MatchString checks whether a textual regular expression // MatchString checks whether a textual regular expression
// matches a string. More complicated queries need // matches a string. More complicated queries need
@ -921,7 +1043,7 @@ func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) str
searchPos := 0 // position where we next look for a match searchPos := 0 // position where we next look for a match
buf := new(bytes.Buffer) buf := new(bytes.Buffer)
for searchPos <= len(src) { for searchPos <= len(src) {
a := re.doExecute(src, nil, searchPos) a := re.doExecute(newInputString(src), searchPos)
if len(a) == 0 { if len(a) == 0 {
break // no more matches break // no more matches
} }
@ -973,7 +1095,7 @@ func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
searchPos := 0 // position where we next look for a match searchPos := 0 // position where we next look for a match
buf := new(bytes.Buffer) buf := new(bytes.Buffer)
for searchPos <= len(src) { for searchPos <= len(src) {
a := re.doExecute("", src, searchPos) a := re.doExecute(newInputBytes(src), searchPos)
if len(a) == 0 { if len(a) == 0 {
break // no more matches break // no more matches
} }
@ -1038,7 +1160,13 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
} }
for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
matches := re.doExecute(s, b, pos) var in input
if b == nil {
in = newInputString(s)
} else {
in = newInputBytes(b)
}
matches := re.doExecute(in, pos)
if len(matches) == 0 { if len(matches) == 0 {
break break
} }
@ -1052,6 +1180,7 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
accept = false accept = false
} }
var width int var width int
// TODO: use step()
if b == nil { if b == nil {
_, width = utf8.DecodeRuneInString(s[pos:end]) _, width = utf8.DecodeRuneInString(s[pos:end])
} else { } else {
@ -1077,7 +1206,7 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
// Find returns a slice holding the text of the leftmost match in b of the regular expression. // Find returns a slice holding the text of the leftmost match in b of the regular expression.
// A return value of nil indicates no match. // A return value of nil indicates no match.
func (re *Regexp) Find(b []byte) []byte { func (re *Regexp) Find(b []byte) []byte {
a := re.doExecute("", b, 0) a := re.doExecute(newInputBytes(b), 0)
if a == nil { if a == nil {
return nil return nil
} }
@ -1089,7 +1218,7 @@ func (re *Regexp) Find(b []byte) []byte {
// b[loc[0]:loc[1]]. // b[loc[0]:loc[1]].
// A return value of nil indicates no match. // A return value of nil indicates no match.
func (re *Regexp) FindIndex(b []byte) (loc []int) { func (re *Regexp) FindIndex(b []byte) (loc []int) {
a := re.doExecute("", b, 0) a := re.doExecute(newInputBytes(b), 0)
if a == nil { if a == nil {
return nil return nil
} }
@ -1102,7 +1231,7 @@ func (re *Regexp) FindIndex(b []byte) (loc []int) {
// an empty string. Use FindStringIndex or FindStringSubmatch if it is // an empty string. Use FindStringIndex or FindStringSubmatch if it is
// necessary to distinguish these cases. // necessary to distinguish these cases.
func (re *Regexp) FindString(s string) string { func (re *Regexp) FindString(s string) string {
a := re.doExecute(s, nil, 0) a := re.doExecute(newInputString(s), 0)
if a == nil { if a == nil {
return "" return ""
} }
@ -1114,7 +1243,19 @@ func (re *Regexp) FindString(s string) string {
// itself is at s[loc[0]:loc[1]]. // itself is at s[loc[0]:loc[1]].
// A return value of nil indicates no match. // A return value of nil indicates no match.
func (re *Regexp) FindStringIndex(s string) []int { func (re *Regexp) FindStringIndex(s string) []int {
a := re.doExecute(s, nil, 0) a := re.doExecute(newInputString(s), 0)
if a == nil {
return nil
}
return a[0:2]
}
// FindReaderIndex returns a two-element slice of integers defining the
// location of the leftmost match of the regular expression in text read from
// the RuneReader. The match text was found in the input stream at byte
// offset loc[0] through loc[1]-1. A return value of nil indicates no match.
func (re *Regexp) FindReaderIndex(r io.RuneReader) []int {
a := re.doExecute(newInputReader(r), 0)
if a == nil { if a == nil {
return nil return nil
} }
@ -1127,7 +1268,7 @@ func (re *Regexp) FindStringIndex(s string) []int {
// comment. // comment.
// A return value of nil indicates no match. // A return value of nil indicates no match.
func (re *Regexp) FindSubmatch(b []byte) [][]byte { func (re *Regexp) FindSubmatch(b []byte) [][]byte {
a := re.doExecute("", b, 0) a := re.doExecute(newInputBytes(b), 0)
if a == nil { if a == nil {
return nil return nil
} }
@ -1146,7 +1287,7 @@ func (re *Regexp) FindSubmatch(b []byte) [][]byte {
// in the package comment. // in the package comment.
// A return value of nil indicates no match. // A return value of nil indicates no match.
func (re *Regexp) FindSubmatchIndex(b []byte) []int { func (re *Regexp) FindSubmatchIndex(b []byte) []int {
return re.doExecute("", b, 0) return re.doExecute(newInputBytes(b), 0)
} }
// FindStringSubmatch returns a slice of strings holding the text of the // FindStringSubmatch returns a slice of strings holding the text of the
@ -1155,7 +1296,7 @@ func (re *Regexp) FindSubmatchIndex(b []byte) []int {
// package comment. // package comment.
// A return value of nil indicates no match. // A return value of nil indicates no match.
func (re *Regexp) FindStringSubmatch(s string) []string { func (re *Regexp) FindStringSubmatch(s string) []string {
a := re.doExecute(s, nil, 0) a := re.doExecute(newInputString(s), 0)
if a == nil { if a == nil {
return nil return nil
} }
@ -1174,7 +1315,16 @@ func (re *Regexp) FindStringSubmatch(s string) []string {
// 'Index' descriptions in the package comment. // 'Index' descriptions in the package comment.
// A return value of nil indicates no match. // A return value of nil indicates no match.
func (re *Regexp) FindStringSubmatchIndex(s string) []int { func (re *Regexp) FindStringSubmatchIndex(s string) []int {
return re.doExecute(s, nil, 0) return re.doExecute(newInputString(s), 0)
}
// FindReaderSubmatchIndex returns a slice holding the index pairs
// identifying the leftmost match of the regular expression of text read by
// the RuneReader, and the matches, if any, of its subexpressions, as defined
// by the 'Submatch' and 'Index' descriptions in the package comment. A
// return value of nil indicates no match.
func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
return re.doExecute(newInputReader(r), 0)
} }
const startSize = 10 // The size at which to start a slice in the 'All' routines. const startSize = 10 // The size at which to start a slice in the 'All' routines.