;; ;; Lexer combinator library. ;; ;; Based on the SML lexer generator by Thant Tessman. ;; ;; Copyright 2009-2010 Ivan Raikov and the Okinawa Institute of ;; Science and Technology. ;; ;; ;; This program is free software: you can redistribute it and/or ;; modify it under the terms of the GNU General Public License as ;; published by the Free Software Foundation, either version 3 of the ;; License, or (at your option) any later version. ;; ;; This program is distributed in the hope that it will be useful, but ;; WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;; General Public License for more details. ;; ;; A full copy of the GPL license can be found at ;; . (module lexgen ( tok seq star bar redo try pass pos opt char set range lst lit bind drop rebind longest lex stream-unfold ) (import scheme chicken data-structures srfi-14) (require-library srfi-1) (import (only srfi-1 first second filter-map fold concatenate every lset<= )) ;; ;; This is a lexer generator comprised in its core of five small ;; functions. The programmer assembles these functions into regular ;; expression pattern-matching functions. ;; ;; The idea is that a pattern matcher function takes a list of ;; streams, and returns a new list of streams advanced by every ;; combination allowed by the pattern matcher function. ;; ;; A stream is a list that can take one of two forms: ;; ;; 1) A list of two elements: the first element is a list of ;; elements consumed by the pattern matcher; the second element is a ;; list of characters not yet consumed. E.g., the list ;; ;; ((a) (b c d e)) ;; ;; represents a stream that contains the consumed character a, ;; and the unconsumed characters b c d e. ;; ;; 2) A list of three elements: the first two elements are as ;; before, but the third element is a procedure that is applied to ;; the tail of the unconsumed list, in order to obtain the next ;; character. E.g., the list: ;; ;; ((a) (b ) ;; ;; represents a stream that contains the consumed character a, the ;; unconsumed character b, and an input port to read subsequent ;; character from; and a procedure that reads one character from the ;; input port, and returns it along with the modified port. Note ;; that the use of side-effecting structures such as ports will lead ;; to erroneous results with backtracking parsers. ;; ;; Also note that the number of streams returned by the function ;; typically won't match the number of streams passed in. If the ;; pattern doesn't match at all, the empty list is returned. ;; ;; input stream comparison procedures (define (lst<= pred xx yy) (let ((r0 (let recur ((xx xx) (yy yy)) (cond ((null? xx) #t) ((null? yy) #f) (else (if (pred (car xx) (car yy)) (recur (cdr xx) (cdr yy)) #f)))))) r0)) (define (stream<= x y) (cond ((null? x) #t) ((null? y) #f) (else (and (lst<= eq? (car x) (car y)) (eq? (safe-car (cadr x)) (safe-car (cadr y))))))) (define (safe-car x) (and (pair? x) (car x))) ;; 'tok' builds a pattern matcher function that applies procedure p to ;; a given token and an input character. If the procedure returns a ;; true value, that value is prepended to the list of consumed ;; elements, and the input character is removed from the list of input ;; elements. (define (tok t p ) (let ((f (lambda (s) (let ((l (length s))) (cond ((fx= l 2) (let ((c (car s)) (u (cadr s))) (and (pair? u) (let ((ans (p t (car u)))) (and ans (list (cons ans c) (cdr u))))))) ((fx= l 3) (let ((c (car s)) (u (cadr s)) (succ (caddr s))) (and (pair? u) (let ((ans (p t (car u)))) (and ans (cons (cons ans c) (succ (cdr u)))))))) (else #f)))))) (lambda (cont streams) (let ((streams1 (filter-map f streams))) (cont streams1))))) ;; This matches a sequence of patterns. (define (seq p1 p2) (lambda (cont streams) (if (null? streams) (cont streams) (p1 (lambda (streams1) (if (null? streams1) (cont streams1) (p2 cont streams1))) streams)))) ;; This matches either one of two patterns. It's analogous to patterns ;; separated by the '|' in regular expressions. (define (bar p1 p2) (lambda (cont streams) (if (null? streams) (cont streams) (let ((cont1 (lambda (streams1) (if (lset<= stream<= streams1 streams) (p2 cont streams) (cont streams1) )))) (p1 cont1 streams))))) ;; Kleene closure. Analogous to '*' (define (star p) (lambda (cont streams) (if (null? streams) (cont streams) (let ((cont1 (lambda (streams1) (cont (concatenate (list streams streams1)))))) (p (lambda (streams1) (cond ((lset<= stream<= streams1 streams) (cont streams)) (else ((star p) cont1 streams1)))) streams))))) ;; If parser p fails (returns an empty list), then invoke parser q on ;; the original stream, otherwise call r on the result of p. (define (redo p q r) (lambda (cont streams) (let ((cont1 (lambda (streams1) (if (null? streams1) (q cont streams) (r cont streams1))))) (p cont1 streams)))) ;; The rest of these are built from the previous five and are provided ;; for convenience. ;; this parser always succeeds (define (pass cont s) (cont s)) ;; Positive closure. Analogous to '+' (define (pos pat) (seq pat (star pat))) ;; Optional pattern. Analogous to '?' (define (opt pat) (bar pat pass)) ;; Converts a binary predicate procedure to a binary procedure that ;; returns its right argument when the predicate is true, and false ;; otherwise. (define (try p) (lambda (x y) (let ((res (p x y))) (and res y)))) ;; Matches a single character (define (char c) (tok c (try char=?))) ;; Matches any of a SRFI-14 set of characters. (define (set s) (let ((cs (if (char-set? s) s (list->char-set (if (string? s) (string->list s) s))))) (tok cs (try char-set-contains?)))) ;; Range of characters. Analogous to character class '[]' (define (range a b) (if (charchar-set (char->integer a) (+ 1 (char->integer b)))))) ;; Matches a consecutive list of patterns (define (lst ps) (let ((ps (reverse ps))) (let loop ((ps (cdr ps)) (p1 (car ps))) (cond ((null? ps) p1) (else (loop (cdr ps) (seq (car ps) p1))))))) ;; Matches a literal string s (define (lit s) (let ((f (lambda (t) (tok t (try char=?))))) (lst (map f (if (string? s) (string->list s) s))))) ;; datatype used by bind and drop (define-record-type box (make-box contents) box? (contents box-contents )) (define box make-box) (define unbox box-contents) ;; Given a list (X_1 ... X_n), returns a list ( (X_1 ... X_(n-1)) X_n ) (define-inline (split-at-last x) (if (null? x) (list #f (list)) (let loop ((prev (list (car x))) (rest (cdr x))) (cond ((null? rest) (if (null? (cdr prev)) (list #f (car prev)) (list (reverse (cdr prev)) (car prev)))) (else (loop (cons (car rest) prev) (cdr rest))))))) ;; helpers for bind (define-inline (bind-apply f) (lambda (s) (cond ((pair? s) (let ((eaten (car s)) (food (cdr s))) (let* ((ep (split-at-last eaten)) (eaten1 (car ep)) (eaten0 (cadr ep))) (assert (box? eaten0)) (let* ((x (and (list? eaten1) (f eaten1))) (res (if x (cons (append x (unbox eaten0)) food) (cons (unbox eaten0) food)))) res)))) (else s)))) (define-inline (box-stream s) (cond ((pair? s) (let ((eaten (car s)) (food (cdr s))) (cons (list (box eaten)) food))) (else s))) ;; Binds a procedure f to the consumed tokens returned by p (define (bind f p) (lambda (cont ss) (let ((ss1 (map box-stream ss)) (cont1 (lambda (ss) (let ((ss1 (map (bind-apply f) ss))) (cont ss1))))) (p cont1 ss1)))) (define (drop p) (bind (lambda x #f) p)) ;; helpers for rebind (define-inline (rebind-apply g) (lambda (i s) (cond ((pair? s) (let ((eaten (car s)) (food (cdr s))) (let* ((ep (split-at-last eaten)) (eaten1 (car ep)) (eaten0 (cadr ep))) (assert (box? eaten0)) (let* ((x (and (list? eaten1) (g i eaten1))) (res (if x (cons (append x (unbox eaten0)) food) (cons (unbox eaten0) food)))) res)))) (else s)))) ;; Applies a procedure f to the un-consumed tokens, then applies ;; procedure g to the result of f and the tokens returned by p (define (rebind f g p) (lambda (cont ss) (let* ((info (map (compose f cadr) ss)) (ss1 (map box-stream ss)) (cont1 (lambda (ss) (let ((ss1 (map (rebind-apply g) info ss))) (cont ss1))))) (p cont1 ss1)))) ;; Takes the resulting streams produced by the application of a ;; pattern on a stream (or streams) and selects the longest match if ;; one exists. (define-inline (longest0 streams) (let ((count+stream (fold (lambda (stream max) (cond ((and (pair? stream) (pair? max)) (let ((eaten (car stream)) (max-count (car max)) (max-stream (cadr max))) (if (< max-count (length eaten)) (list (length eaten) stream) max))) (else (error 'longest "invalid stream" stream)))) (list 0 `(() ())) streams))) (and (positive? (car count+stream)) (cadr count+stream)))) (define (longest p) (lambda (cont s) (p (lambda (s1) (if (or (null? s1) (null? (cdr s1))) (cont s1) (let ((res (longest0 s1))) (cont (list res))))) s))) ;; This takes a pattern and a string, turns the string into a list of ;; streams (containing one stream), applies the pattern, and returns ;; the longest match. (define (->char-list s) (if (string? s) (list (string->list s)) s)) (define (lex pat error ss) (let* ((stream (cond ((string? ss) `((() . ,(->char-list ss)))) ((pair? ss) ss) (else (error ss)))) (res (longest0 (pat (lambda (s1) (if (null? s1) (error ss) s1)) stream)))) (and res (list (reverse (first res)) (second res))))) ;; A helper procedure to transform streams from one format to ;; another. Procedure F must be a procedure of two arguments, a state ;; and a list of unconsumed characters. Procedure G is applied to an ;; unconsumed element, and is expected to return the original element ;; representation, before F was applied to the unconsumed stream. (define (stream-unfold init f g) (lambda (strm) (let ((l (length strm))) (cond ((fx= l 2) (let ((c (car strm)) (u (cadr strm))) (and (pair? u) (let ((h (car u)) (r (cdr u))) (letrec ((succ0 (lambda (r) (if (null? r) strm (let ((h1 (g (car r))) (r1 (f (car r) (cdr r)))) (list (cons h1 r1) succ0)))))) (list c (cons h (f init r)) succ0)))))) ((fx= l 3) (let ((c (car strm)) (u (cadr strm)) (succ (caddr strm))) (and (pair? u) (let ((h (car u)) (r (cdr u))) (letrec ((succ0 (lambda (r) (if (null? r) strm (let ((h1 (g (car r))) (r1 (f (car r) (succ (cdr r))))) (list (cons h1 r1) succ0)))))) (list c (cons h (f init r)) succ0)))))) (else #f))))) )