scanner.scm (4937B) [raw]
1 ;; scanner.scm -- tokenizes input 2 (import (chicken format)) 3 4 ;; Auto-generates the scaffolding getters and setters 5 ;; make-token, token-type, set-token-type!, etc 6 (define-record token type lexeme literal line) 7 (set-record-printer! token (lambda (t out) 8 (fprintf out "#,(token type:~S lex:~S lit:~S ln:~S)" 9 (token-type t) (token-lexeme t) (token-literal t) (token-line t)))) 10 11 (define (digit? c) 12 (and c (char<=? #\0 c) (char>=? #\9 c))) 13 14 (define (alpha? c) 15 (and c 16 (or 17 (eq? c #\_) 18 (and (char<=? #\a c) (char>=? #\z c)) 19 (and (char<=? #\A c) (char>=? #\Z c))))) 20 21 (define (get-keyword k) 22 (alist-ref k '(("and" . AND) 23 ("class" . CLASS) 24 ("else" . ELSE) 25 ("false" . FALSE) 26 ("for" . FOR) 27 ("fun" . FUN) 28 ("if" . IF) 29 ("nil" . NIL) 30 ("or" . OR) 31 ("print" . PRINT) 32 ("return". RETURN) 33 ("super" . SUPER) 34 ("this" . THIS) 35 ("true" . TRUE) 36 ("var" . VAR) 37 ("while" . WHILE)) equal?)) 38 39 (define (alnum? c) 40 (and c (or (alpha? c) (digit? c)))) 41 42 (define (scan src) 43 (define (peek i) 44 ;; safe string-ref 45 (if (< i (string-length src)) 46 (string-ref src i) 47 #f)) 48 49 (define (get-tokens s i line in) 50 ;; Gets all tokens after 'start', tracks state in i (current char), line, in 51 (define (tok-range type s2 i2) 52 ;; Helper to make a token, cons it to our list, and recurse with fresh state 53 (let ((text (substring src s2 (add1 i2)))) 54 (let ((tok (cond 55 ((eq? type 'STRING) (make-token type text (substring src (add1 s2) i2) line)) 56 ((eq? type 'NUMBER) (make-token type text (string->number text) line)) 57 ((eq? type 'IDENTIFIER) 58 (let ((k (get-keyword text))) 59 (if k 60 (make-token k text #f line) 61 (make-token 'IDENTIFIER text #f line)))) 62 (else (make-token type text #f line))))) 63 (cons tok (get-tokens (add1 i2) (add1 i2) line #f))))) 64 65 (define (tok type) 66 ;; helper to tokenize current span 67 (tok-range type s i)) 68 69 (define (skip . line2) 70 ;; Helper to skip this character range 71 (get-tokens (add1 i) (add1 i) (optional line2 line) in)) 72 73 (define (advance . line2) 74 ;; Helper to iterate; keeps start but increments range 75 (get-tokens s (add1 i) (optional line2 line) in)) 76 77 (let ((c (peek i)) (n (peek (add1 i)))) 78 (if (and (not in) (not c)) 79 (list (make-token 'EOF "" #f line)) 80 (cond 81 ((eq? in 'comment) (if (or (not c) (eq? #\newline c)) 82 (get-tokens (add1 i) (add1 i) (add1 line) #f) 83 (advance))) 84 ((eq? in 'string) 85 (cond 86 ((not c) (fname-err! (format "~A:unterminated string" line))) 87 ((eq? #\" c) (tok 'STRING)) 88 ((eq? #\newline c) (advance (add1 line))) 89 (else (advance)))) 90 ((eq? in 'number) 91 (cond 92 ((digit? c) (advance)) 93 ((eq? #\. c) (get-tokens s (add1 i) line 'decimal)) 94 (else (tok-range 'NUMBER s (sub1 i))))) 95 ((eq? in 'decimal) 96 (cond 97 ((digit? c) (advance)) 98 (else (tok-range 'NUMBER s (sub1 i))))) 99 ((eq? in 'alpha) 100 (cond 101 ((alnum? c) (advance)) 102 (else (tok-range 'IDENTIFIER s (sub1 i))))) 103 (else (cond 104 ((eq? #\( c) (tok 'LEFT_PAREN)) 105 ((eq? #\) c) (tok 'RIGHT_PAREN)) 106 ((eq? #\{ c) (tok 'LEFT_BRACE)) 107 ((eq? #\} c) (tok 'RIGHT_BRACE)) 108 ((eq? #\, c) (tok 'COMMA)) 109 ((eq? #\. c) (tok 'DOT)) 110 ((eq? #\- c) (tok 'MINUS)) 111 ((eq? #\+ c) (tok 'PLUS)) 112 ((eq? #\; c) (tok 'SEMICOLON)) 113 ((eq? #\* c) (tok 'STAR)) 114 ((eq? #\! c) (if (eq? #\= n) (tok-range 'BANG_EQUAL s (add1 i)) (tok 'BANG))) 115 ((eq? #\= c) (if (eq? #\= n) (tok-range 'EQUAL_EQUAL s (add1 i) ) (tok 'EQUAL))) 116 ((eq? #\< c) (if (eq? #\= n) (tok-range 'LESS_EQUAL s (add1 i) ) (tok 'LESS))) 117 ((eq? #\> c) (if (eq? #\= n) (tok-range 'GREATER_EQUAL s (add1 i) ) (tok 'GREATER))) 118 ((eq? #\/ c) (if (eq? #\/ n) (get-tokens s (add1 i) line 'comment) (tok 'SLASH))) 119 ((eq? #\" c) (get-tokens s (add1 i) line 'string)) 120 ((digit? c) (get-tokens s (add1 i) line 'number)) 121 ((alpha? c) (get-tokens s (add1 i) line 'alpha)) 122 ((eq? #\space c) (skip)) 123 ((eq? #\tab c) (skip)) 124 ((eq? #\newline c) (skip (add1 line))) 125 (else (fname-err! (format "~A:unexpected character: ~A" line c)) (skip)))))))) 126 127 (get-tokens 0 0 1 #f))