go/usr/gri/pretty/scanner.go

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package Scanner
import Utils "utils"


// Token kinds produced by the scanner. The values are consecutive
// integers assigned by iota, starting with ILLEGAL == 0; the order of
// the entries therefore determines their numeric values.
export const (
	ILLEGAL = iota;

	// identifiers, literals, comments, and end of file
	IDENT;
	INT;
	FLOAT;
	STRING;
	COMMENT;
	EOF;

	// arithmetic operators: + - * / %
	ADD;
	SUB;
	MUL;
	QUO;
	REM;

	// bitwise operators and shifts: & | ^ << >>
	AND;
	OR;
	XOR;
	SHL;
	SHR;

	// arithmetic assignment operators: += -= *= /= %=
	ADD_ASSIGN;
	SUB_ASSIGN;
	MUL_ASSIGN;
	QUO_ASSIGN;
	REM_ASSIGN;

	// bitwise assignment operators: &= |= ^= <<= >>=
	AND_ASSIGN;
	OR_ASSIGN;
	XOR_ASSIGN;
	SHL_ASSIGN;
	SHR_ASSIGN;

	// && || <- ++ --
	LAND;
	LOR;
	ARROW;
	INC;
	DEC;

	// comparison operators: == != < <= > >=
	EQL;
	NEQ;
	LSS;
	LEQ;
	GTR;
	GEQ;

	// = := ! ...
	ASSIGN;
	DEFINE;
	NOT;
	ELLIPSIS;

	// parentheses, brackets, and braces
	LPAREN;
	RPAREN;
	LBRACK;
	RBRACK;
	LBRACE;
	RBRACE;

	// , ; : .
	COMMA;
	SEMICOLON;
	COLON;
	PERIOD;

	// keywords
	// KEYWORDS_BEG and KEYWORDS_END are markers only: init() enters every
	// token strictly between them into the Keywords table, so all keyword
	// tokens (and nothing else) must be declared between the two.
	KEYWORDS_BEG;
	BREAK;
	CASE;
	CHAN;
	CONST;
	CONTINUE;

	DEFAULT;
	ELSE;
	EXPORT;
	FALLTHROUGH;
	FOR;

	FUNC;
	GO;
	GOTO;
	IF;
	IMPORT;

	INTERFACE;
	MAP;
	PACKAGE;
	RANGE;
	RETURN;

	SELECT;
	STRUCT;
	SWITCH;
	TYPE;
	VAR;
	KEYWORDS_END;

	// AST use only
	// (never returned by Scan in this file)
	EXPRSTAT;
)


// TokenString returns a printable representation of the token kind tok.
// Operators and delimiters are rendered as their source text, keywords
// as the keyword itself, and the remaining token classes by name.
// The keyword strings are also used by init() to populate the Keywords
// table, so they must match the language keywords exactly.
// Unknown values (including the KEYWORDS_BEG/KEYWORDS_END markers) are
// rendered as "token(N)" with N in decimal.
export func TokenString(tok int) string {
	switch tok {
	case ILLEGAL: return "ILLEGAL";

	case IDENT: return "IDENT";
	case INT: return "INT";
	case FLOAT: return "FLOAT";
	case STRING: return "STRING";
	case COMMENT: return "COMMENT";
	case EOF: return "EOF";

	case ADD: return "+";
	case SUB: return "-";
	case MUL: return "*";
	case QUO: return "/";
	case REM: return "%";

	case AND: return "&";
	case OR: return "|";
	case XOR: return "^";
	case SHL: return "<<";
	case SHR: return ">>";

	case ADD_ASSIGN: return "+=";
	case SUB_ASSIGN: return "-=";
	case MUL_ASSIGN: return "*=";  // was "+=", which collided with ADD_ASSIGN
	case QUO_ASSIGN: return "/=";
	case REM_ASSIGN: return "%=";

	case AND_ASSIGN: return "&=";
	case OR_ASSIGN: return "|=";
	case XOR_ASSIGN: return "^=";
	case SHL_ASSIGN: return "<<=";
	case SHR_ASSIGN: return ">>=";

	case LAND: return "&&";
	case LOR: return "||";
	case ARROW: return "<-";
	case INC: return "++";
	case DEC: return "--";

	case EQL: return "==";
	case NEQ: return "!=";
	case LSS: return "<";
	case LEQ: return "<=";
	case GTR: return ">";
	case GEQ: return ">=";

	case ASSIGN: return "=";
	case DEFINE: return ":=";
	case NOT: return "!";
	case ELLIPSIS: return "...";

	case LPAREN: return "(";
	case RPAREN: return ")";
	case LBRACK: return "[";
	case RBRACK: return "]";
	// NOTE(review): braces are reported by name rather than as "{" and "}",
	// unlike the other delimiters; unclear whether callers rely on this,
	// so it is left unchanged.
	case LBRACE: return "LBRACE";
	case RBRACE: return "RBRACE";

	case COMMA: return ",";
	case SEMICOLON: return ";";
	case COLON: return ":";
	case PERIOD: return ".";

	case BREAK: return "break";
	case CASE: return "case";
	case CHAN: return "chan";
	case CONST: return "const";
	case CONTINUE: return "continue";

	case DEFAULT: return "default";
	case ELSE: return "else";
	case EXPORT: return "export";
	case FALLTHROUGH: return "fallthrough";
	case FOR: return "for";

	case FUNC: return "func";
	case GO: return "go";
	case GOTO: return "goto";
	case IF: return "if";
	case IMPORT: return "import";

	case INTERFACE: return "interface";
	case MAP: return "map";
	case PACKAGE: return "package";
	case RANGE: return "range";
	case RETURN: return "return";

	case SELECT: return "select";
	case STRUCT: return "struct";
	case SWITCH: return "switch";
	case TYPE: return "type";
	case VAR: return "var";

	case EXPRSTAT: return "EXPRSTAT";
	}

	// not a known token kind: fall back to its numeric value
	return "token(" + Utils.IntToString(tok, 10) + ")";
}


// Precedence bounds. Precedence() returns values in the range
// LowestPrec..6; LowestPrec is its result for any token that is not
// one of the operators it lists.
export const (
	LowestPrec = -1;
	UnaryPrec = 7;  // above every value returned by Precedence(); presumably used for unary operators - confirm with callers
	HighestPrec = 8;
)


// Precedence reports the binding strength of the operator token tok.
// Larger values bind tighter:
//
//	0  :
//	1  ||
//	2  &&
//	3  <-
//	4  == != < <= > >=
//	5  + - | ^
//	6  * / % << >> &
//
// Every other token yields LowestPrec.
export func Precedence(tok int) int {
	prec := LowestPrec;
	switch tok {
	case COLON:
		prec = 0;
	case LOR:
		prec = 1;
	case LAND:
		prec = 2;
	case ARROW:
		prec = 3;
	case EQL, NEQ, LSS, LEQ, GTR, GEQ:
		prec = 4;
	case ADD, SUB, OR, XOR:
		prec = 5;
	case MUL, QUO, REM, SHL, SHR, AND:
		prec = 6;
	}
	return prec;
}


// Keywords maps keyword text (e.g. "func") to its token value.
// It is filled in by init() and consulted by ScanIdentifier.
var Keywords *map [string] int;


// init builds the Keywords table: each token strictly between the
// KEYWORDS_BEG and KEYWORDS_END markers is entered under the string
// TokenString reports for it.
func init() {
	Keywords = new(map [string] int);
	tok := KEYWORDS_BEG + 1;
	for tok < KEYWORDS_END {
		Keywords[TokenString(tok)] = tok;
		tok++;
	}
}


// is_whitespace reports whether ch is a blank, tab, newline, or
// carriage return.
func is_whitespace(ch int) bool {
	switch ch {
	case ' ', '\t', '\n', '\r':
		return true;
	}
	return false;
}


// is_letter reports whether ch may appear in an identifier as a letter:
// an ASCII letter, an underscore, or any char with value >= 128
// (all non-ASCII chars are accepted without further classification).
func is_letter(ch int) bool {
	if 'a' <= ch && ch <= 'z' {
		return true;
	}
	if 'A' <= ch && ch <= 'Z' {
		return true;
	}
	return ch == '_' || ch >= 128;
}


// digit_val returns the numeric value of ch interpreted as a digit in
// a base up to 16: '0'..'9' give 0..9 and 'a'..'f' / 'A'..'F' give
// 10..15. Any other char gives 16, which is larger than every legal
// digit value, so "digit_val(ch) < base" doubles as a digit test.
func digit_val(ch int) int {
	switch {
	case '0' <= ch && ch <= '9':
		return ch - '0';
	case 'a' <= ch && ch <= 'f':
		return 10 + (ch - 'a');
	case 'A' <= ch && ch <= 'F':
		return 10 + (ch - 'A');
	}
	return 16;  // not a digit
}


// Scanner holds the state for tokenizing one source text. It must be
// initialized with Open before Scan is called.
export type Scanner struct {
	// error handling
	filename string;  // error reporting only
	nerrors int;  // number of errors
	errpos int;  // last error position
	columns bool;  // if set, print columns in error messages

	// scanning
	src string;  // scanned source
	pos int;  // current reading position
	ch int;  // one char look-ahead
	chpos int;  // position of ch

	// testmode
	testmode bool;  // if set, ScanComment interprets ERROR and SYNC comments
	testpos int;  // position of the next expected error; < 0: follow-up errors are ignored; len(src)+1: no error expected
}


// Next reads the next Unicode char into S.ch, records its byte offset
// in S.chpos, and advances S.pos past it.
// S.ch < 0 means end-of-file; in that case S.chpos is len(S.src).
func (S *Scanner) Next() {
	if S.pos < len(S.src) {
		// assume ascii: a byte below 0x80 is a complete char of width 1
		r, w := int(S.src[S.pos]), 1;
		if r >= 0x80 {
			// wasn't ascii - note that 0x80 itself is a UTF-8
			// continuation byte and must go through the decoder too
			r, w = sys.stringtorune(S.src, S.pos);
		}
		S.ch = r;
		S.chpos = S.pos;
		S.pos += w;
	} else {
		S.ch = -1;  // eof
		S.chpos = len(S.src);
	}
/*
	const (
		Bit1 = 7;
		Bitx = 6;
		Bit2 = 5;
		Bit3 = 4;
		Bit4 = 3;

		T1 = (1 << (Bit1 + 1) - 1) ^ 0xFF;  // 0000 0000
		Tx = (1 << (Bitx + 1) - 1) ^ 0xFF;  // 1000 0000
		T2 = (1 << (Bit2 + 1) - 1) ^ 0xFF;  // 1100 0000
		T3 = (1 << (Bit3 + 1) - 1) ^ 0xFF;  // 1110 0000
		T4 = (1 << (Bit4 + 1) - 1) ^ 0xFF;  // 1111 0000

		Rune1 = 1 << (Bit1 + 0*Bitx) - 1;  // 0000 0000 0111 1111
		Rune2 = 1 << (Bit2 + 1*Bitx) - 1;  // 0000 0111 1111 1111
		Rune3 = 1 << (Bit3 + 2*Bitx) - 1;  // 1111 1111 1111 1111

		Maskx = 0x3F;  // 1 << Bitx - 1;  // 0011 1111
		Testx = 0xC0;  // Maskx ^ 0xFF;  // 1100 0000

		Bad	= 0xFFFD;  // Runeerror
	);

	src := S.src;
	lim := len(src);
	pos := S.pos;

	// 1-byte sequence
	// 0000-007F => T1
	if pos >= lim {
		S.ch = -1;  // end of file
		S.chpos = lim;
		return;
	}
	c0 := int(src[pos]);
	pos++;
	if c0 < Tx {
		S.ch = c0;
		S.chpos = S.pos;
		S.pos = pos;
		return;
	}

	// 2-byte sequence
	// 0080-07FF => T2 Tx
	if pos >= lim {
		goto bad;
	}
	c1 := int(src[pos]) ^ Tx;
	pos++;
	if c1 & Testx != 0 {
		goto bad;
	}
	if c0 < T3 {
		if c0 < T2 {
			goto bad;
		}
		r := (c0 << Bitx | c1) & Rune2;
		if  r <= Rune1 {
			goto bad;
		}
		S.ch = r;
		S.chpos = S.pos;
		S.pos = pos;
		return;
	}

	// 3-byte sequence
	// 0800-FFFF => T3 Tx Tx
	if pos >= lim {
		goto bad;
	}
	c2 := int(src[pos]) ^ Tx;
	pos++;
	if c2 & Testx != 0 {
		goto bad;
	}
	if c0 < T4 {
		r := (((c0 << Bitx | c1) << Bitx) | c2) & Rune3;
		if r <= Rune2 {
			goto bad;
		}
		S.ch = r;
		S.chpos = S.pos;
		S.pos = pos;
		return;
	}

	// bad encoding
bad:
	S.ch = Bad;
	S.chpos = S.pos;
	S.pos += 1;
	return;
*/
}


// LineCol computes (line, column) information for a given source
// position pos (a byte offset into S.src). Both results are 1-based and
// the column counts bytes, so the first byte of every line - including
// the first line - is reported as column 1. A pos beyond the end of the
// source is clamped to len(S.src).
// The source is rescanned from the start on every call.
func (S *Scanner) LineCol(pos int) (line, col int) {
	line = 1;
	// offset of the most recent '\n' before pos; -1 acts as a virtual
	// newline in front of the source so that line 1 is numbered like
	// all other lines (it used to start at 0, making columns on the
	// first line 0-based while all later lines were 1-based)
	lpos := -1;

	src := S.src;
	if pos > len(src) {
		pos = len(src);
	}

	for i := 0; i < pos; i++ {
		if src[i] == '\n' {
			line++;
			lpos = i;
		}
	}

	return line, pos - lpos;
}


// ErrorMsg unconditionally prints msg in the form
//
//	filename:line[:col]: msg
//
// The position part is omitted when pos < 0, and the column is printed
// only if S.columns is set. The error is counted and pos is remembered
// as the last error position; once 10 errors have been reported the
// program is terminated via sys.exit(1).
func (S *Scanner) ErrorMsg(pos int, msg string) {
	print(S.filename);
	if pos >= 0 {
		line, col := S.LineCol(pos);
		print(":", line);
		if S.columns {
			print(":", col);
		}
	}
	print(": ", msg, "\n");

	S.errpos = pos;
	S.nerrors++;
	if S.nerrors >= 10 {
		// give up: too many errors
		sys.exit(1);
	}
}


// Error reports msg for source position pos unless the error is
// expected (test mode) or is likely a follow-up of the previous one.
func (S *Scanner) Error(pos int, msg string) {
	// Expected errors are swallowed:
	// - S.testpos < 0: an expected error was already seen; follow-up
	//   errors are expected as well and ignored
	// - pos == S.testpos: this is the expected error; ignore it and
	//   switch to ignoring follow-up errors
	if S.testpos < 0 || S.testpos == pos {
		S.testpos = -1;
		return;
	}

	// Suppress errors that are close to the previously reported one;
	// they are most likely consequences of it.
	const errdist = 20;
	dist := S.errpos - pos;
	if dist < 0 {
		dist = -dist;  // the new error may lie before the previous one
	}

	// the very first error is always reported
	if S.nerrors == 0 || dist > errdist {
		S.ErrorMsg(pos, msg);
	}
}


// ExpectNoErrors declares that no error is expected from here on by
// moving the expected error position past anything Error can be called
// with.
func (S *Scanner) ExpectNoErrors() {
	// The eof position itself is a legal error position, so the first
	// position that can never match is the one after it.
	eof := len(S.src);
	S.testpos = eof + 1;
}


// Open (re)initializes S for scanning src. filename is used in error
// messages only; columns selects whether error messages include column
// numbers; testmode enables interpretation of ERROR and SYNC comments.
// On return the first char of src has been read into S.ch.
func (S *Scanner) Open(filename, src string, columns, testmode bool) {
	// scanning state
	S.src = src;
	S.pos = 0;

	// error reporting state
	S.filename = filename;
	S.columns = columns;
	S.nerrors = 0;
	S.errpos = 0;

	// test mode state; ExpectNoErrors needs S.src to be set
	S.testmode = testmode;
	S.ExpectNoErrors();

	// load the look-ahead char last, once everything else is in place
	S.Next();
}


// CharString formats ch for use in error messages as
//
//	'c' (U+XXXX)
//
// where c is the char itself, or its backslash escape for the control
// chars \a \b \f \n \r \t \v, the backslash, and the single quote, and
// XXXX is the char value in base 16 as produced by Utils.IntToString.
func CharString(ch int) string {
	var text string;
	switch ch {
	case '\a': text = `\a`;
	case '\b': text = `\b`;
	case '\f': text = `\f`;
	case '\n': text = `\n`;
	case '\r': text = `\r`;
	case '\t': text = `\t`;
	case '\v': text = `\v`;
	case '\\': text = `\\`;
	case '\'': text = `\'`;
	default: text = string(ch);
	}
	code := Utils.IntToString(ch, 16);
	return "'" + text + "' (U+" + code + ")";
}


// Expect checks that the current char is ch and reports an error at
// its position otherwise. The current char is consumed in either case
// so that the scanner always makes progress.
func (S *Scanner) Expect(ch int) {
	if S.ch != ch {
		msg := "expected " + CharString(ch) + ", found " + CharString(S.ch);
		S.Error(S.chpos, msg);
	}
	S.Next();
}


// SkipWhitespace advances past blanks, tabs, newlines, and carriage
// returns, leaving the first non-whitespace char (or eof) in S.ch.
func (S *Scanner) SkipWhitespace() {
	for {
		if !is_whitespace(S.ch) {
			break;
		}
		S.Next();
	}
}


// ScanComment scans a comment whose leading '/' has already been
// consumed and returns the complete comment text, starting at that '/'.
// A //-style comment extends through the terminating newline, which is
// part of the returned text; a /*-style comment extends through the
// closing "*/". If the end of the source is reached first, "comment not
// terminated" is reported and the text up to eof is returned.
//
// In test mode the comment is additionally interpreted: a comment whose
// text at [3 : 8] is "ERROR" announces an expected error at the
// position of the next non-whitespace char, and one whose text at
// [3 : 7] is "SYNC" ends any pending error expectation.
func (S *Scanner) ScanComment() string {
	// first '/' already consumed
	pos := S.chpos - 1;

	if S.ch == '/' {
		// comment
		// (line comment: consume everything up to and including '\n')
		for S.ch >= 0 {
			S.Next();
			if S.ch == '\n' {
				S.Next();
				goto exit;
			}
		}

	} else {
		/* comment */
		// (general comment: ch holds the previous char so that the
		// two-char terminator "*/" can be recognized)
		S.Expect('*');
		for S.ch >= 0 {
			ch := S.ch;
			S.Next();
			if ch == '*' && S.ch == '/' {
				S.Next();
				goto exit;
			}
		}
	}

	// reached eof without finding the end of the comment
	S.Error(pos, "comment not terminated");

exit:
	comment := S.src[pos : S.chpos];
	if S.testmode {
		// interpret ERROR and SYNC comments
		// oldpos remembers the previously expected error position (if
		// this is an ERROR or SYNC comment) before it is overwritten
		oldpos := -1;
		switch {
		case len(comment) >= 8 && comment[3 : 8] == "ERROR" :
			// an error is expected at the next token position
			oldpos = S.testpos;
			S.SkipWhitespace();
			S.testpos = S.chpos;
		case len(comment) >= 7 && comment[3 : 7] == "SYNC" :
			// scanning/parsing synchronized again - no (follow-up) errors expected
			oldpos = S.testpos;
			S.ExpectNoErrors();
		}

		// an old expected position inside the source means that Error
		// was never called for it (Error resets S.testpos to -1, and
		// ExpectNoErrors sets it to len(S.src) + 1)
		if 0 <= oldpos && oldpos <= len(S.src) {
			// the previous error was not found
			S.ErrorMsg(oldpos, "ERROR not found");
		}
	}

	return comment;
}


// ScanIdentifier scans the identifier starting at the current char and
// returns its text in val. tok is the keyword token if the text is
// found in the Keywords table and IDENT otherwise.
func (S *Scanner) ScanIdentifier() (tok int, val string) {
	start := S.chpos;
	for {
		// identifier chars are letters and decimal digits
		if !is_letter(S.ch) && digit_val(S.ch) >= 10 {
			break;
		}
		S.Next();
	}
	val = S.src[start : S.chpos];

	kw, found := Keywords[val];
	if found {
		return kw, val;
	}
	return IDENT, val;
}


// ScanMantissa consumes a (possibly empty) run of digits that are
// legal in the given base.
func (S *Scanner) ScanMantissa(base int) {
	for {
		if digit_val(S.ch) >= base {
			break;
		}
		S.Next();
	}
}


// ScanNumber scans an integer or floating-point literal and returns its
// kind (INT or FLOAT) together with its source text.
// If seen_decimal_point is set, the caller has already consumed a '.'
// that starts the literal (as in ".5"); the '.' is included in the
// returned text. Otherwise scanning starts at the current char.
// No errors are reported here; the literal simply ends at the first
// char that cannot continue it.
func (S *Scanner) ScanNumber(seen_decimal_point bool) (tok int, val string) {
	pos := S.chpos;
	tok = INT;

	if seen_decimal_point {
		// fraction digits follow directly; only an exponent may come after
		tok = FLOAT;
		pos--;  // '.' is one byte
		S.ScanMantissa(10);
		goto exponent;
	}

	if S.ch == '0' {
		// int or float
		S.Next();
		if S.ch == 'x' || S.ch == 'X' {
			// hexadecimal int
			S.Next();
			S.ScanMantissa(16);
		} else {
			// octal int or float
			S.ScanMantissa(8);
			// an 8 or 9, a '.', or an exponent after the octal digits
			// means this is a decimal float with leading zero(s)
			if digit_val(S.ch) < 10 || S.ch == '.' || S.ch == 'e' || S.ch == 'E' {
				// float
				tok = FLOAT;
				goto mantissa;
			}
			// octal int
		}
		goto exit;
	}

mantissa:
	// decimal int or float
	S.ScanMantissa(10);

	if S.ch == '.' {
		// float
		tok = FLOAT;
		S.Next();
		S.ScanMantissa(10)
	}

exponent:
	if S.ch == 'e' || S.ch == 'E' {
		// float
		tok = FLOAT;
		S.Next();
		// optional sign of the exponent
		if S.ch == '-' || S.ch == '+' {
			S.Next();
		}
		S.ScanMantissa(10);
	}

exit:
	return tok, S.src[pos : S.chpos];
}


// ScanDigits consumes the digits of a numeric char escape: exactly n
// digits of the given base are expected at the current position.
// At most n digits are consumed; if fewer are present, "illegal char
// escape" is reported at the first offending char.
func (S *Scanner) ScanDigits(n int, base int) {
	// Stop after n digits: any further digit belongs to whatever
	// follows the escape (e.g. the '1' in "\x411"), not to the escape.
	// Without the n > 0 bound the loop swallowed all following digits,
	// so over-long escapes in char literals went undetected.
	for n > 0 && digit_val(S.ch) < base {
		S.Next();
		n--;
	}
	if n > 0 {
		// ran out of digits before n were seen
		S.Error(S.chpos, "illegal char escape");
	}
}


// ScanEscape scans the remainder of an escape sequence; the backslash
// has already been consumed and the current char is the one following
// it. quote is the delimiter of the enclosing literal, which may be
// escaped as well. Numeric escapes are checked for their digit count
// (3 octal digits, or 2, 4, or 8 hex digits after 'x', 'u', or 'U');
// any other char after the backslash is reported as an illegal escape.
//
// The result is provisional: for single-letter escapes and the quote
// it is the char after the backslash (not the char it denotes), and
// for all other cases it is the empty string.
func (S *Scanner) ScanEscape(quote int) string {
	// TODO: fix this routine

	esc := S.ch;
	escpos := S.chpos;
	S.Next();

	switch esc {
	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\':
		return string(esc);

	case '0', '1', '2', '3', '4', '5', '6', '7':
		// the first of the 3 octal digits is esc itself
		S.ScanDigits(3 - 1, 8);

	case 'x':
		S.ScanDigits(2, 16);

	case 'u':
		S.ScanDigits(4, 16);

	case 'U':
		S.ScanDigits(8, 16);

	default:
		// quote is not a constant and thus cannot be a switch case
		if esc == quote {
			return string(quote);
		}
		S.Error(escpos, "illegal char escape");
	}

	return "";  // TODO fix this
}


// ScanChar scans a char literal whose opening '\'' has already been
// consumed and returns its source text including both quotes. The
// literal consists of one char or one escape sequence; a missing
// closing quote is reported by Expect.
func (S *Scanner) ScanChar() string {
	start := S.chpos - 1;  // position of the opening quote

	escaped := S.ch == '\\';
	S.Next();
	if escaped {
		S.ScanEscape('\'');
	}

	S.Expect('\'');
	return S.src[start : S.chpos];
}


// ScanString scans an interpreted string literal whose opening '"' has
// already been consumed and returns its source text starting at that
// quote. A newline or eof before the closing quote is reported as
// "string not terminated" at the position of the opening quote.
// One more char is consumed after the loop in every case: normally
// this is the closing quote, after an error it is whatever follows
// the offending newline.
func (S *Scanner) ScanString() string {
	start := S.chpos - 1;  // position of the opening quote

	for {
		if S.ch == '"' {
			break;
		}
		c := S.ch;
		S.Next();
		if c < 0 || c == '\n' {
			S.Error(start, "string not terminated");
			break;
		}
		if c == '\\' {
			// skip the escape so that an escaped quote does not
			// end the string
			S.ScanEscape('"');
		}
	}

	S.Next();
	return S.src[start : S.chpos];
}


// ScanRawString scans a raw string literal whose opening '`' has
// already been consumed and returns its source text starting at that
// back quote. No escapes are interpreted. A newline or eof before the
// closing back quote is reported as "string not terminated" at the
// position of the opening back quote.
// One more char is consumed after the loop in every case: normally
// this is the closing back quote.
func (S *Scanner) ScanRawString() string {
	start := S.chpos - 1;  // position of the opening back quote

	for {
		if S.ch == '`' {
			break;
		}
		c := S.ch;
		S.Next();
		if c < 0 || c == '\n' {
			S.Error(start, "string not terminated");
			break;
		}
	}

	S.Next();
	return S.src[start : S.chpos];
}


// Select2 chooses between an operator and its "op=" form: if the
// current char is '=' it is consumed and tok1 is returned, otherwise
// nothing is consumed and tok0 is returned.
func (S *Scanner) Select2(tok0, tok1 int) int {
	if S.ch != '=' {
		return tok0;
	}
	S.Next();
	return tok1;
}


// Select3 chooses among three operators sharing a first char: tok1 if
// the current char is '=', tok2 if it is ch2, and tok0 otherwise. The
// matched char is consumed; for tok0 nothing is consumed.
func (S *Scanner) Select3(tok0, tok1, ch2, tok2 int) int {
	tok := tok0;
	if S.ch == '=' {
		S.Next();
		tok = tok1;
	} else if S.ch == ch2 {
		S.Next();
		tok = tok2;
	}
	return tok;
}


// Select4 chooses among four operators sharing a first char: tok1 if
// the current char is '=', tok2 if it is ch2, tok3 if it is ch2
// followed by '=', and tok0 otherwise. All matched chars are consumed;
// for tok0 nothing is consumed.
func (S *Scanner) Select4(tok0, tok1, ch2, tok2, tok3 int) int {
	tok := tok0;
	if S.ch == '=' {
		S.Next();
		tok = tok1;
	} else if S.ch == ch2 {
		S.Next();
		// after ch2 the remaining choice is the plain/"=" pair
		tok = S.Select2(tok2, tok3);
	}
	return tok;
}


// Scan skips whitespace and scans the next token. It returns the
// position of the token's first char, the token kind, and - for
// identifiers, keywords, numbers, strings, char literals, and
// comments - the token's source text in val (val is empty for
// operators and delimiters). At the end of the source, tok is EOF.
// An unrecognized char is reported as an error and yields ILLEGAL.
func (S *Scanner) Scan() (pos, tok int, val string) {
	S.SkipWhitespace();

	ch := S.ch;
	pos = S.chpos;
	tok = ILLEGAL;

	switch {
	case is_letter(ch): tok, val = S.ScanIdentifier();
	case digit_val(ch) < 10: tok, val = S.ScanNumber(false);
	default:
		// ch is consumed here; in the cases below S.ch is the char
		// following it and the Scan*/Select* helpers rely on that
		S.Next();  // always make progress
		switch ch {
		case -1: tok = EOF;
		case '"': tok, val = STRING, S.ScanString();
		case '\'': tok, val = INT, S.ScanChar();  // char literals are INT tokens
		case '`': tok, val = STRING, S.ScanRawString();
		case ':': tok = S.Select2(COLON, DEFINE);
		case '.':
			if digit_val(S.ch) < 10 {
				// number starting with a decimal point
				tok, val = S.ScanNumber(true);
			} else if S.ch == '.' {
				S.Next();
				if S.ch == '.' {
					S.Next();
					tok = ELLIPSIS;
				}
				// NOTE(review): ".." not followed by a third '.'
				// leaves tok == ILLEGAL without reporting an error
			} else {
				tok = PERIOD;
			}
		case ',': tok = COMMA;
		case ';': tok = SEMICOLON;
		case '(': tok = LPAREN;
		case ')': tok = RPAREN;
		case '[': tok = LBRACK;
		case ']': tok = RBRACK;
		case '{': tok = LBRACE;
		case '}': tok = RBRACE;
		case '+': tok = S.Select3(ADD, ADD_ASSIGN, '+', INC);
		case '-': tok = S.Select3(SUB, SUB_ASSIGN, '-', DEC);
		case '*': tok = S.Select2(MUL, MUL_ASSIGN);
		case '/':
			if S.ch == '/' || S.ch == '*' {
				// comments are returned as tokens, not skipped
				tok, val = COMMENT, S.ScanComment();
			} else {
				tok = S.Select2(QUO, QUO_ASSIGN);
			}
		case '%': tok = S.Select2(REM, REM_ASSIGN);
		case '^': tok = S.Select2(XOR, XOR_ASSIGN);
		case '<':
			// '<' starts five tokens: <- < <= << <<=
			if S.ch == '-' {
				S.Next();
				tok = ARROW;
			} else {
				tok = S.Select4(LSS, LEQ, '<', SHL, SHL_ASSIGN);
			}
		case '>': tok = S.Select4(GTR, GEQ, '>', SHR, SHR_ASSIGN);
		case '=': tok = S.Select2(ASSIGN, EQL);
		case '!': tok = S.Select2(NOT, NEQ);
		case '&': tok = S.Select3(AND, AND_ASSIGN, '&', LAND);
		case '|': tok = S.Select3(OR, OR_ASSIGN, '|', LOR);
		default:
			S.Error(pos, "illegal character " + CharString(ch));
			tok = ILLEGAL;
		}
	}

	return pos, tok, val;
}


// Token bundles the three results of one Scan call; TokenStream sends
// values of this type over its channel.
export type Token struct {
	pos int;  // source position of the token's first char
	tok int;  // token kind (one of the constants above)
	val string;  // token text as returned by Scan; empty for operators and delimiters
}


// TokenStream starts a goroutine that scans S and sends each token,
// up to and including the EOF token, over a channel buffered for 100
// tokens. The receive side of that channel is returned. The goroutine
// ends after sending EOF; the channel is not closed.
func (S *Scanner) TokenStream() *<-chan *Token {
	stream := new(chan *Token, 100);
	go func(S *Scanner, out *chan <- *Token) {
		eof := false;
		for !eof {
			t := new(Token);
			t.pos, t.tok, t.val = S.Scan();
			out <- t;
			eof = t.tok == EOF;
		}
	}(S, stream);
	return stream;
}