// Copyright 2009 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package Scanner import ( "utf8"; "unicode"; "utils"; ) const ( ILLEGAL = iota; IDENT; INT; FLOAT; STRING; EOF; COMMENT; ADD; SUB; MUL; QUO; REM; AND; OR; XOR; SHL; SHR; ADD_ASSIGN; SUB_ASSIGN; MUL_ASSIGN; QUO_ASSIGN; REM_ASSIGN; AND_ASSIGN; OR_ASSIGN; XOR_ASSIGN; SHL_ASSIGN; SHR_ASSIGN; LAND; LOR; ARROW; INC; DEC; EQL; NEQ; LSS; LEQ; GTR; GEQ; ASSIGN; DEFINE; NOT; ELLIPSIS; LPAREN; RPAREN; LBRACK; RBRACK; LBRACE; RBRACE; COMMA; SEMICOLON; COLON; PERIOD; // keywords keywords_beg; BREAK; CASE; CHAN; CONST; CONTINUE; DEFAULT; DEFER; ELSE; FALLTHROUGH; FOR; FUNC; GO; GOTO; IF; IMPORT; INTERFACE; MAP; PACKAGE; RANGE; RETURN; SELECT; STRUCT; SWITCH; TYPE; VAR; keywords_end; ) func TokenString(tok int) string { switch tok { case ILLEGAL: return "ILLEGAL"; case IDENT: return "IDENT"; case INT: return "INT"; case FLOAT: return "FLOAT"; case STRING: return "STRING"; case EOF: return "EOF"; case COMMENT: return "COMMENT"; case ADD: return "+"; case SUB: return "-"; case MUL: return "*"; case QUO: return "/"; case REM: return "%"; case AND: return "&"; case OR: return "|"; case XOR: return "^"; case SHL: return "<<"; case SHR: return ">>"; case ADD_ASSIGN: return "+="; case SUB_ASSIGN: return "-="; case MUL_ASSIGN: return "+="; case QUO_ASSIGN: return "/="; case REM_ASSIGN: return "%="; case AND_ASSIGN: return "&="; case OR_ASSIGN: return "|="; case XOR_ASSIGN: return "^="; case SHL_ASSIGN: return "<<="; case SHR_ASSIGN: return ">>="; case LAND: return "&&"; case LOR: return "||"; case ARROW: return "<-"; case INC: return "++"; case DEC: return "--"; case EQL: return "=="; case NEQ: return "!="; case LSS: return "<"; case LEQ: return "<="; case GTR: return ">"; case GEQ: return ">="; case ASSIGN: return "="; case DEFINE: return ":="; case NOT: return "!"; case ELLIPSIS: return "..."; case LPAREN: return "("; case RPAREN: return ")"; case LBRACK: return "["; case RBRACK: return "]"; case LBRACE: return "{"; case RBRACE: return "}"; case COMMA: return ","; case SEMICOLON: return ";"; case COLON: return ":"; case PERIOD: return "."; case BREAK: return "break"; case CASE: return "case"; case CHAN: return "chan"; case CONST: return "const"; case CONTINUE: return "continue"; case DEFAULT: return "default"; case DEFER: return "defer"; case ELSE: return "else"; case FALLTHROUGH: return "fallthrough"; case FOR: return "for"; case FUNC: return "func"; case GO: return "go"; case GOTO: return "goto"; case IF: return "if"; case IMPORT: return "import"; case INTERFACE: return "interface"; case MAP: return "map"; case PACKAGE: return "package"; case RANGE: return "range"; case RETURN: return "return"; case SELECT: return "select"; case STRUCT: return "struct"; case SWITCH: return "switch"; case TYPE: return "type"; case VAR: return "var"; } return "token(" + Utils.IntToString(tok, 10) + ")"; } const ( LowestPrec = -1; UnaryPrec = 7; HighestPrec = 8; ) func Precedence(tok int) int { switch tok { case COLON: return 0; case LOR: return 1; case LAND: return 2; case ARROW: return 3; case EQL, NEQ, LSS, LEQ, GTR, GEQ: return 4; case ADD, SUB, OR, XOR: return 5; case MUL, QUO, REM, SHL, SHR, AND: return 6; } return LowestPrec; } var keywords map [string] int; func init() { keywords = make(map [string] int); for i := keywords_beg + 1; i < keywords_end; i++ { keywords[TokenString(i)] = i; } } func is_letter(ch int) bool { return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || // common case ch == '_' || unicode.IsLetter(ch); } func digit_val(ch int) int { if '0' <= ch && ch <= '9' { return ch - '0'; } if 'a' <= ch && ch <= 'f' { return ch - 'a' + 10; } if 'A' <= ch && ch <= 'F' { return ch - 'A' + 10; } return 16; // larger than any legal digit val } type ErrorHandler interface { Error(pos int, msg string); Warning(pos int, msg string); } type Scanner struct { // setup err ErrorHandler; src string; // source scan_comments bool; // scanning pos int; // current reading position ch int; // one char look-ahead chpos int; // position of ch linepos int; // position of beginning of line // testmode testmode bool; testpos int; } // Read the next Unicode char into S.ch. // S.ch < 0 means end-of-file. func (S *Scanner) next() { if S.pos < len(S.src) { // assume ascii r, w := int(S.src[S.pos]), 1; if r >= 0x80 { // not ascii r, w = utf8.DecodeRuneInString(S.src, S.pos); } S.ch = r; S.chpos = S.pos; S.pos += w; } else { S.ch = -1; // eof S.chpos = len(S.src); } } func (S *Scanner) Error(pos int, msg string) { // check for expected errors (test mode) if S.testpos < 0 || pos == S.testpos { // test mode: // S.testpos < 0: // follow-up errors are expected and ignored // S.testpos == 0: // an error is expected at S.testpos and ignored S.testpos = -1; return; } S.err.Error(pos, msg); } func (S *Scanner) expectNoErrors() { // set the next expected error position to one after eof // (the eof position is a legal error position!) S.testpos = len(S.src) + 1; } func (S *Scanner) Init(err ErrorHandler, src string, scan_comments, testmode bool) { S.err = err; S.src = src; S.scan_comments = scan_comments; S.pos = 0; S.linepos = 0; S.testmode = testmode; S.expectNoErrors(); // S.src must be set S.next(); // S.expectNoErrrors() must be called before } func charString(ch int) string { s := string(ch); switch ch { case '\a': s = `\a`; case '\b': s = `\b`; case '\f': s = `\f`; case '\n': s = `\n`; case '\r': s = `\r`; case '\t': s = `\t`; case '\v': s = `\v`; case '\\': s = `\\`; case '\'': s = `\'`; } return "'" + s + "' (U+" + Utils.IntToString(ch, 16) + ")"; } func (S *Scanner) expect(ch int) { if S.ch != ch { S.Error(S.chpos, "expected " + charString(ch) + ", found " + charString(S.ch)); } S.next(); // make always progress } func (S *Scanner) skipWhitespace() { for { switch S.ch { case '\t', '\r', ' ': // nothing to do case '\n': if S.scan_comments { return; } default: return; } S.next(); } panic("UNREACHABLE"); } func (S *Scanner) scanComment() string { // first '/' already consumed pos := S.chpos - 1; if S.ch == '/' { //-style comment S.next(); for S.ch >= 0 { S.next(); if S.ch == '\n' { // '\n' terminates comment but we do not include // it in the comment (otherwise we don't see the // start of a newline in skipWhitespace()). goto exit; } } } else { /*-style comment */ S.expect('*'); for S.ch >= 0 { ch := S.ch; S.next(); if ch == '*' && S.ch == '/' { S.next(); goto exit; } } } S.Error(pos, "comment not terminated"); exit: comment := S.src[pos : S.chpos]; if S.testmode { // interpret ERROR and SYNC comments oldpos := -1; switch { case len(comment) >= 8 && comment[3 : 8] == "ERROR" : // an error is expected at the next token position oldpos = S.testpos; S.skipWhitespace(); S.testpos = S.chpos; case len(comment) >= 7 && comment[3 : 7] == "SYNC" : // scanning/parsing synchronized again - no (follow-up) errors expected oldpos = S.testpos; S.expectNoErrors(); } if 0 <= oldpos && oldpos <= len(S.src) { // the previous error was not found S.Error(oldpos, "ERROR not found"); // TODO this should call ErrorMsg } } return comment; } func (S *Scanner) scanIdentifier() (tok int, val string) { pos := S.chpos; for is_letter(S.ch) || digit_val(S.ch) < 10 { S.next(); } val = S.src[pos : S.chpos]; var present bool; tok, present = keywords[val]; if !present { tok = IDENT; } return tok, val; } func (S *Scanner) scanMantissa(base int) { for digit_val(S.ch) < base { S.next(); } } func (S *Scanner) scanNumber(seen_decimal_point bool) (tok int, val string) { pos := S.chpos; tok = INT; if seen_decimal_point { tok = FLOAT; pos--; // '.' is one byte S.scanMantissa(10); goto exponent; } if S.ch == '0' { // int or float S.next(); if S.ch == 'x' || S.ch == 'X' { // hexadecimal int S.next(); S.scanMantissa(16); } else { // octal int or float S.scanMantissa(8); if digit_val(S.ch) < 10 || S.ch == '.' || S.ch == 'e' || S.ch == 'E' { // float tok = FLOAT; goto mantissa; } // octal int } goto exit; } mantissa: // decimal int or float S.scanMantissa(10); if S.ch == '.' { // float tok = FLOAT; S.next(); S.scanMantissa(10) } exponent: if S.ch == 'e' || S.ch == 'E' { // float tok = FLOAT; S.next(); if S.ch == '-' || S.ch == '+' { S.next(); } S.scanMantissa(10); } exit: return tok, S.src[pos : S.chpos]; } func (S *Scanner) scanDigits(n int, base int) { for digit_val(S.ch) < base { S.next(); n--; } if n > 0 { S.Error(S.chpos, "illegal char escape"); } } func (S *Scanner) scanEscape(quote int) string { // TODO: fix this routine ch := S.ch; pos := S.chpos; S.next(); switch ch { case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\': return string(ch); case '0', '1', '2', '3', '4', '5', '6', '7': S.scanDigits(3 - 1, 8); // 1 char already read return ""; // TODO fix this case 'x': S.scanDigits(2, 16); return ""; // TODO fix this case 'u': S.scanDigits(4, 16); return ""; // TODO fix this case 'U': S.scanDigits(8, 16); return ""; // TODO fix this default: // check for quote outside the switch for better generated code (eventually) if ch == quote { return string(quote); } S.Error(pos, "illegal char escape"); } return ""; // TODO fix this } func (S *Scanner) scanChar() string { // '\'' already consumed pos := S.chpos - 1; ch := S.ch; S.next(); if ch == '\\' { S.scanEscape('\''); } S.expect('\''); return S.src[pos : S.chpos]; } func (S *Scanner) scanString() string { // '"' already consumed pos := S.chpos - 1; for S.ch != '"' { ch := S.ch; S.next(); if ch == '\n' || ch < 0 { S.Error(pos, "string not terminated"); break; } if ch == '\\' { S.scanEscape('"'); } } S.next(); return S.src[pos : S.chpos]; } func (S *Scanner) scanRawString() string { // '`' already consumed pos := S.chpos - 1; for S.ch != '`' { ch := S.ch; S.next(); if ch == '\n' || ch < 0 { S.Error(pos, "string not terminated"); break; } } S.next(); return S.src[pos : S.chpos]; } func (S *Scanner) select2(tok0, tok1 int) int { if S.ch == '=' { S.next(); return tok1; } return tok0; } func (S *Scanner) select3(tok0, tok1, ch2, tok2 int) int { if S.ch == '=' { S.next(); return tok1; } if S.ch == ch2 { S.next(); return tok2; } return tok0; } func (S *Scanner) select4(tok0, tok1, ch2, tok2, tok3 int) int { if S.ch == '=' { S.next(); return tok1; } if S.ch == ch2 { S.next(); if S.ch == '=' { S.next(); return tok3; } return tok2; } return tok0; } func (S *Scanner) Scan() (pos, tok int, val string) { loop: S.skipWhitespace(); pos, tok = S.chpos, ILLEGAL; switch ch := S.ch; { case is_letter(ch): tok, val = S.scanIdentifier(); case digit_val(ch) < 10: tok, val = S.scanNumber(false); default: S.next(); // always make progress switch ch { case -1: tok = EOF; case '\n': tok, val = COMMENT, "\n"; case '"': tok, val = STRING, S.scanString(); case '\'': tok, val = INT, S.scanChar(); case '`': tok, val = STRING, S.scanRawString(); case ':': tok = S.select2(COLON, DEFINE); case '.': if digit_val(S.ch) < 10 { tok, val = S.scanNumber(true); } else if S.ch == '.' { S.next(); if S.ch == '.' { S.next(); tok = ELLIPSIS; } } else { tok = PERIOD; } case ',': tok = COMMA; case ';': tok = SEMICOLON; case '(': tok = LPAREN; case ')': tok = RPAREN; case '[': tok = LBRACK; case ']': tok = RBRACK; case '{': tok = LBRACE; case '}': tok = RBRACE; case '+': tok = S.select3(ADD, ADD_ASSIGN, '+', INC); case '-': tok = S.select3(SUB, SUB_ASSIGN, '-', DEC); case '*': tok = S.select2(MUL, MUL_ASSIGN); case '/': if S.ch == '/' || S.ch == '*' { tok, val = COMMENT, S.scanComment(); if !S.scan_comments { goto loop; } } else { tok = S.select2(QUO, QUO_ASSIGN); } case '%': tok = S.select2(REM, REM_ASSIGN); case '^': tok = S.select2(XOR, XOR_ASSIGN); case '<': if S.ch == '-' { S.next(); tok = ARROW; } else { tok = S.select4(LSS, LEQ, '<', SHL, SHL_ASSIGN); } case '>': tok = S.select4(GTR, GEQ, '>', SHR, SHR_ASSIGN); case '=': tok = S.select2(ASSIGN, EQL); case '!': tok = S.select2(NOT, NEQ); case '&': tok = S.select3(AND, AND_ASSIGN, '&', LAND); case '|': tok = S.select3(OR, OR_ASSIGN, '|', LOR); default: S.Error(pos, "illegal character " + charString(ch)); tok = ILLEGAL; } } return pos, tok, val; }