From 7bf30556ad7377eda417b53b0ed75263ef4fd6a3 Mon Sep 17 00:00:00 2001 From: Jordan Orelli Date: Sun, 14 Oct 2012 23:14:12 -0400 Subject: [PATCH] split lexer out into lex.go --- input.lisp | 10 ++ lex.go | 307 +++++++++++++++++++++++++++++++++++++++++++++++++ skeam.go | 330 +++-------------------------------------------------- 3 files changed, 331 insertions(+), 316 deletions(-) create mode 100644 lex.go diff --git a/input.lisp b/input.lisp index 1dd82a9..5e3ae9f 100644 --- a/input.lisp +++ b/input.lisp @@ -29,4 +29,14 @@ (set! x (+ x 1)) (* x 2)) +; ------------------------------------------------------------------------------ +; the following stuff comes directly from the norvig essay, instead of being +; contrived lexer tests. +; ------------------------------------------------------------------------------ + +; define a function and then execute it (begin (define r 3) (* 3.141592653 (* r r))) + +; same thing, alternative form without "begin" +(define area (lambda (r) (* 3.141592653 (* r r)))) +(area 3) diff --git a/lex.go b/lex.go new file mode 100644 index 0000000..de374e5 --- /dev/null +++ b/lex.go @@ -0,0 +1,307 @@ +package main + +import ( + "io" + "fmt" + "strings" +) + +type typ3 int + +const ( + invalid typ3 = iota + int3ger + symbol + openParen + closeParen + str1ng + fl0at +) + +func (t typ3) String() string { + switch t { + case int3ger: + return "integer" + case symbol: + return "symbol" + case openParen: + return "open_paren" + case closeParen: + return "close_paren" + case str1ng: + return "string" + case fl0at: + return "float" + } + panic("wtf") +} + +type token struct { + lexeme string + t typ3 +} + +type stateFn func(*lexer) (stateFn, error) + +type lexer struct { + io.RuneReader + cur []rune + depth int + out chan token +} + +// clears the current lexem buffer and emits a token of the given type. +// There's no sanity checking to make sure you don't emit some bullshit, so +// don't fuck it up. +func (l *lexer) emit(t typ3) { + debugPrint("emit " + string(l.cur)) + l.out <- token{lexeme: string(l.cur), t: t} + l.cur = nil +} + +// appends the rune to the current in-progress lexem +func (l *lexer) append(r rune) { + debugPrint(fmt.Sprintf("append %c\n", (r))) + if l.cur == nil { + l.cur = make([]rune, 0, 32) + } + l.cur = append(l.cur, r) +} + +func isDigit(r rune) bool { + switch r { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + return true + } + return false +} + +func debugPrint(s string) { + if DEBUG { + fmt.Println(s) + } +} + +// lexes an open parenthesis +func lexOpenParen(l *lexer) (stateFn, error) { + debugPrint("-->lexOpenParen") + l.out <- token{"(", openParen} + l.depth++ + r, _, err := l.ReadRune() + if err != nil { + return nil, err + } + switch r { + case ' ', '\t', '\n', '\r': + return lexWhitespace, nil + case '(': + return lexOpenParen, nil + case ')': + return lexCloseParen, nil + case ';': + return lexComment, nil + } + if isDigit(r) { + l.append(r) + return lexInt, nil + } + l.append(r) + return lexSymbol, nil +} + +// lexes some whitespace in progress. Maybe this should be combined with root +// and the lexer shouldn't have a state. I think wehat I'm doing now is +// "wrong" but who honestly gives a shit. +func lexWhitespace(l *lexer) (stateFn, error) { + debugPrint("-->lexWhitespace") + r, _, err := l.ReadRune() + if err != nil { + return nil, err + } + switch r { + case ' ', '\t', '\n', '\r': + return lexWhitespace, nil + case '"': + return lexString, nil + case '(': + return lexOpenParen, nil + case ')': + return lexCloseParen, nil + case ';': + return lexComment, nil + } + if isDigit(r) { + l.append(r) + return lexInt, nil + } + l.append(r) + return lexSymbol, nil +} + +func lexString(l *lexer) (stateFn, error) { + debugPrint("-->lexString") + r, _, err := l.ReadRune() + if err != nil { + return nil, err + } + switch r { + case '"': + l.emit(str1ng) + return lexWhitespace, nil + case '\\': + return lexStringEsc, nil + } + l.append(r) + return lexString, nil +} + +// lex the character *after* the string escape character \ +func lexStringEsc(l *lexer) (stateFn, error) { + debugPrint("-->lexStringEsc") + r, _, err := l.ReadRune() + if err != nil { + return nil, err + } + l.append(r) + return lexString, nil +} + +// lex an integer. Once we're on an integer, the only valid characters are +// whitespace, close paren, a period to indicate we want a float, or more +// digits. Everything else is crap. +func lexInt(l *lexer) (stateFn, error) { + debugPrint("-->lexInt") + r, _, err := l.ReadRune() + if err != nil { + return nil, err + } + switch r { + case ' ', '\t', '\n', '\r': + l.emit(int3ger) + return lexWhitespace, nil + case '.': + l.append(r) + return lexFloat, nil + case ')': + l.emit(int3ger) + return lexCloseParen, nil + case ';': + l.emit(int3ger) + return lexComment, nil + } + if isDigit(r) { + l.append(r) + return lexInt, nil + } + return nil, fmt.Errorf("unexpected rune in lexInt: %c", r) +} + +// once we're in a float, the only valid values are digits, whitespace or close +// paren. +func lexFloat(l *lexer) (stateFn, error) { + debugPrint("-->lexFloat") + r, _, err := l.ReadRune() + if err != nil { + return nil, err + } + + switch r { + case ' ', '\t', '\n', '\r': + l.emit(fl0at) + return lexWhitespace, nil + case ')': + l.emit(fl0at) + return lexCloseParen, nil + case ';': + l.emit(fl0at) + return lexComment, nil + } + if isDigit(r) { + l.append(r) + return lexFloat, nil + } + return nil, fmt.Errorf("unexpected run in lexFloat: %c", r) +} + +// lexes a symbol in progress +func lexSymbol(l *lexer) (stateFn, error) { + debugPrint("-->lexSymbol") + r, _, err := l.ReadRune() + if err != nil { + return nil, err + } + + switch r { + case ' ', '\t', '\n', '\r': + debugPrint("ending lexSymbol on whitespace") + l.emit(symbol) + return lexWhitespace, nil + case ')': + l.emit(symbol) + return lexCloseParen, nil + case ';': + l.emit(symbol) + return lexComment, nil + default: + l.append(r) + return lexSymbol, nil + } + panic("not reached") +} + +// lex a close parenthesis +func lexCloseParen(l *lexer) (stateFn, error) { + debugPrint("-->lexCloseParen") + l.out <- token{")", closeParen} + l.depth-- + r, _, err := l.ReadRune() + if err != nil { + return nil, err + } + switch r { + case ' ', '\t', '\n', '\r': + return lexWhitespace, nil + case ')': + return lexCloseParen, nil + case ';': + return lexComment, nil + } + return nil, fmt.Errorf("unimplemented") +} + +// lexes a comment +func lexComment(l *lexer) (stateFn, error) { + debugPrint("-->lexComment") + r, _, err := l.ReadRune() + if err != nil { + return nil, err + } + switch r { + case '\n', '\r': + return lexWhitespace, nil + } + return lexComment, nil +} + +// lexes some lispy input from an io.Reader, emiting tokens on chan c. The +// channel is closed when the input reaches EOF, signaling that there are no +// new tokens. +func lex(input io.RuneReader, c chan token) { + defer close(c) + l := &lexer{input, nil, 0, c} + + var err error + f := stateFn(lexWhitespace) + for err == nil { + f, err = f(l) + } + if err != io.EOF { + fmt.Println(err) + } + if l.depth != 0 { + fmt.Println("error: unbalanced parenthesis") + } +} +func lexs(input string, c chan token) { + lex(strings.NewReader(input), c) +} diff --git a/skeam.go b/skeam.go index 8b46da3..9b93e7a 100644 --- a/skeam.go +++ b/skeam.go @@ -5,308 +5,10 @@ import ( "fmt" "io" "os" - "strings" ) var DEBUG = false -type typ3 int - -const ( - invalid typ3 = iota - int3ger - symbol - openParen - closeParen - str1ng - fl0at -) - -func (t typ3) String() string { - switch t { - case int3ger: - return "integer" - case symbol: - return "symbol" - case openParen: - return "open_paren" - case closeParen: - return "close_paren" - case str1ng: - return "string" - case fl0at: - return "float" - } - panic("wtf") -} - -type token struct { - lexeme string - t typ3 -} - -type stateFn func(*lexer) (stateFn, error) - -type lexer struct { - io.RuneReader - cur []rune - depth int - out chan token -} - -// clears the current lexem buffer and emits a token of the given type. -// There's no sanity checking to make sure you don't emit some bullshit, so -// don't fuck it up. -func (l *lexer) emit(t typ3) { - debugPrint("emit " + string(l.cur)) - l.out <- token{lexeme: string(l.cur), t: t} - l.cur = nil -} - -// appends the rune to the current in-progress lexem -func (l *lexer) append(r rune) { - debugPrint(fmt.Sprintf("append %c\n", (r))) - if l.cur == nil { - l.cur = make([]rune, 0, 32) - } - l.cur = append(l.cur, r) -} - -func isDigit(r rune) bool { - switch r { - case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': - return true - } - return false -} - -func debugPrint(s string) { - if DEBUG { - fmt.Println(s) - } -} - -// lexes an open parenthesis -func lexOpenParen(l *lexer) (stateFn, error) { - debugPrint("-->lexOpenParen") - l.out <- token{"(", openParen} - l.depth++ - r, _, err := l.ReadRune() - if err != nil { - return nil, err - } - switch r { - case ' ', '\t', '\n', '\r': - return lexWhitespace, nil - case '(': - return lexOpenParen, nil - case ')': - return lexCloseParen, nil - case ';': - return lexComment, nil - } - if isDigit(r) { - l.append(r) - return lexInt, nil - } - l.append(r) - return lexSymbol, nil -} - -// lexes some whitespace in progress. Maybe this should be combined with root -// and the lexer shouldn't have a state. I think wehat I'm doing now is -// "wrong" but who honestly gives a shit. -func lexWhitespace(l *lexer) (stateFn, error) { - debugPrint("-->lexWhitespace") - r, _, err := l.ReadRune() - if err != nil { - return nil, err - } - switch r { - case ' ', '\t', '\n', '\r': - return lexWhitespace, nil - case '"': - return lexString, nil - case '(': - return lexOpenParen, nil - case ')': - return lexCloseParen, nil - case ';': - return lexComment, nil - } - if isDigit(r) { - l.append(r) - return lexInt, nil - } - l.append(r) - return lexSymbol, nil -} - -func lexString(l *lexer) (stateFn, error) { - debugPrint("-->lexString") - r, _, err := l.ReadRune() - if err != nil { - return nil, err - } - switch r { - case '"': - l.emit(str1ng) - return lexWhitespace, nil - case '\\': - return lexStringEsc, nil - } - l.append(r) - return lexString, nil -} - -// lex the character *after* the string escape character \ -func lexStringEsc(l *lexer) (stateFn, error) { - debugPrint("-->lexStringEsc") - r, _, err := l.ReadRune() - if err != nil { - return nil, err - } - l.append(r) - return lexString, nil -} - -// lex an integer. Once we're on an integer, the only valid characters are -// whitespace, close paren, a period to indicate we want a float, or more -// digits. Everything else is crap. -func lexInt(l *lexer) (stateFn, error) { - debugPrint("-->lexInt") - r, _, err := l.ReadRune() - if err != nil { - return nil, err - } - switch r { - case ' ', '\t', '\n', '\r': - l.emit(int3ger) - return lexWhitespace, nil - case '.': - l.append(r) - return lexFloat, nil - case ')': - l.emit(int3ger) - return lexCloseParen, nil - case ';': - l.emit(int3ger) - return lexComment, nil - } - if isDigit(r) { - l.append(r) - return lexInt, nil - } - return nil, fmt.Errorf("unexpected rune in lexInt: %c", r) -} - -// once we're in a float, the only valid values are digits, whitespace or close -// paren. -func lexFloat(l *lexer) (stateFn, error) { - debugPrint("-->lexFloat") - r, _, err := l.ReadRune() - if err != nil { - return nil, err - } - - switch r { - case ' ', '\t', '\n', '\r': - l.emit(fl0at) - return lexWhitespace, nil - case ')': - l.emit(fl0at) - return lexCloseParen, nil - case ';': - l.emit(fl0at) - return lexComment, nil - } - if isDigit(r) { - l.append(r) - return lexFloat, nil - } - return nil, fmt.Errorf("unexpected run in lexFloat: %c", r) -} - -// lexes a symbol in progress -func lexSymbol(l *lexer) (stateFn, error) { - debugPrint("-->lexSymbol") - r, _, err := l.ReadRune() - if err != nil { - return nil, err - } - - switch r { - case ' ', '\t', '\n', '\r': - debugPrint("ending lexSymbol on whitespace") - l.emit(symbol) - return lexWhitespace, nil - case ')': - l.emit(symbol) - return lexCloseParen, nil - case ';': - l.emit(symbol) - return lexComment, nil - default: - l.append(r) - return lexSymbol, nil - } - panic("not reached") -} - -// lex a close parenthesis -func lexCloseParen(l *lexer) (stateFn, error) { - debugPrint("-->lexCloseParen") - l.out <- token{")", closeParen} - l.depth-- - r, _, err := l.ReadRune() - if err != nil { - return nil, err - } - switch r { - case ' ', '\t', '\n', '\r': - return lexWhitespace, nil - case ')': - return lexCloseParen, nil - case ';': - return lexComment, nil - } - return nil, fmt.Errorf("unimplemented") -} - -// lexes a comment -func lexComment(l *lexer) (stateFn, error) { - debugPrint("-->lexComment") - r, _, err := l.ReadRune() - if err != nil { - return nil, err - } - switch r { - case '\n', '\r': - return lexWhitespace, nil - } - return lexComment, nil -} - -// lexes some lispy input from an io.Reader, emiting tokens on chan c. The -// channel is closed when the input reaches EOF, signaling that there are no -// new tokens. -func lex(input io.RuneReader, c chan token) { - defer close(c) - l := &lexer{input, nil, 0, c} - - var err error - f := stateFn(lexWhitespace) - for err == nil { - f, err = f(l) - } - if err != io.EOF { - fmt.Println(err) - } - if l.depth != 0 { - fmt.Println("error: unbalanced parenthesis") - } -} - func args() { filename := os.Args[1] f, err := os.Open(filename) @@ -324,10 +26,6 @@ func args() { } } -func lexs(input string, c chan token) { - lex(strings.NewReader(input), c) -} - func main() { if len(os.Args) > 1 { args() @@ -341,21 +39,21 @@ func main() { if prefix { fmt.Println("(prefix)") } - switch err { - case nil: - break - case io.EOF: - fmt.Print("\n") - return - default: + switch err { + case nil: + break + case io.EOF: + fmt.Print("\n") + return + default: fmt.Println("error: ", err) - continue - } + continue + } - c := make(chan token, 32) - go lexs(string(line) + "\n", c) - for s := range c { - fmt.Printf("%11s %s\n", s.t, s.lexeme) - } + c := make(chan token, 32) + go lexs(string(line)+"\n", c) + for s := range c { + fmt.Printf("%11s %s\n", s.t, s.lexeme) + } } }